Example #1
def main(model_dump_file):
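    # Loads a trained model, runs it over the preprocessed test set and writes an
    # "ImageId,Label" submission CSV; `data` and `nn` are project-local helper modules.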
    model = data.load_model(model_dump_file)
    testing_set = data.preprocess_testing_set()
    fpath = os.path.join("submit", "submit.csv")
    with open(fpath, 'w') as f:
        f.write("ImageId,Label")
        for i in range(len(testing_set)):
            nn.forward(model, testing_set[i], is_test_time=True)
            f.write("\n%d,%d" % (i + 1, np.argmax(model['score'])))
Example #2
def main(epoch, rate, reg, decay, continue_at, batch_size):
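    # Trains a 784-200-10 network with mini-batch SGD, a per-epoch learning-rate
    # decay schedule and softmax cross-entropy loss; the model and the
    # accuracy/loss curves are saved after every epoch.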
    input_size = 784
    hidden_layer_size = 200
    output_size = 10
    if (continue_at and os.path.exists(continue_at)):
        model = data.load_model(continue_at)
    else:
        model = nn.init_model(input_size, hidden_layer_size, output_size, reg)

    # epoch = 20;
    # base_learning_rate = 1e-5;
    base_learning_rate = rate
    decay_schedule = nn.decay_schedule(epoch, decay)
    learning_rate = base_learning_rate * decay_schedule
    print(learning_rate)

    precision_curve = plot.plot()
    loss_curve = plot.plot()
    for ep in range(epoch):
        lr = learning_rate[ep]
        training_set = data.preprocess_training_set()
        print("training epoch %d/%d with learning rate %g" %
              (ep + 1, epoch, lr))
        batches = nn.sample_batches(training_set, batch_size)
        yes = 0
        cnt = 0
        for batch in batches:
            for item in batch:
                label, img = item
                nn.forward(model, img, is_test_time=False)
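                # numerically stable softmax over the output scores; dz = prob - one_hot(label)
                # is the gradient of the softmax cross-entropy loss w.r.t. the scores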
                prob = model['score'].copy()
                prob -= np.max(prob)
                prob = np.exp(prob) / np.sum(np.exp(prob))
                dz = prob.copy()
                dz[label] -= 1
                nn.sgd_backward(model, dz, batch_size)

                predict = np.argmax(model['score'])
                yes += (predict == label)
                cnt += 1
                if (cnt % 1000 == 0):
                    loss_curve.append(-np.log(prob[label]))
                    print("[%d/%d]: %0.2f%%" % (yes, cnt, yes / cnt * 100),
                          end='\r')
            nn.update_weights(model, lr)
        precision_curve.append(yes / cnt)
        precision_curve.save("precision.jpg")
        loss_curve.save("loss.jpg")
        data.save_model(model)
        print("\nmodel saved\n")
Example #3
def accuracy(nn, data, label, thr=0.5):
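    # Note: despite its name, this returns the percentage of misclassified samples
    # (entries of `predict` that are False).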
    predict = [
        np.int8(nnDif.forward(nn, data[c, :]) > thr) == label[c]
        for c in range(data.shape[0])
    ]
    return 100 * np.double(len(
        np.where(np.asarray(predict) == False)[0])) / np.double(len(predict))
Example #4
def recurrence_all_path(prev, obs):
    # scores all tag paths for one (padded) sentence; `small`, `b_s`, `e_s`,
    # `transitions` and `forward` come from the enclosing scope
    tags_score = obs[0]
    word_pos_ = obs[1] + 1
    tags_score_slice = tags_score[0:word_pos_, :]
    s_len = tf.shape(tags_score_slice)[0]
    obvs = tf.concat(
        [tags_score_slice, small * tf.ones((s_len, 2))], axis=1)
    observations = tf.concat([b_s, obvs, e_s], axis=0)
    all_paths_scores = forward(observations, transitions)
    return tf.reshape(all_paths_scores, [])
Example #5
def accuracy(nn, data, label, thr=0.5):
    predict = [np.int8(nnDif.forward(nn, data[c, :]) > thr) == label[c]
               for c in range(data.shape[0])]
    plotArrX = []
    plotArrY = []
    for pnt in data:
        plotArrX.append(pnt[0])
        plotArrY.append(pnt[1])

    return 100 * np.double(len(np.where(np.asarray(predict) == False)[0])) / np.double(len(predict))
Example #6
def sentVitebe(i, predictTag, scores, transitions, lenVec):
    # Viterbi-decodes sentence i (scores are concatenated over sentences,
    # lenVec holds the sentence lengths) and writes the predicted tag ids
    # back into predictTag
    Len = lenVec[i]
    accLen = lenVec[:i].sum()
    currentTagsScores = scores[accLen:accLen + Len]
    currentPredictIds = forward(currentTagsScores,
                                transitions,
                                viterbi=True,
                                return_alpha=False,
                                return_best_sequence=True)
    predictTag = T.set_subtensor(
        predictTag[accLen:accLen + Len], currentPredictIds)
    return predictTag
Example #7
def accuracy(nn, data, label, thr=0.5):

    predict = [
        np.int8(nnDif.forward(nn, data[c, :]) > thr) == label[c]
        for c in range(data.shape[0])
    ]
    plotArrX = []
    plotArrY = []
    for pnt in data:
        plotArrX.append(pnt[0])
        plotArrY.append(pnt[1])

    return 100 * np.double(len(
        np.where(np.asarray(predict) == False)[0])) / np.double(len(predict))
Example #8
def sentLoss(i, scores, trueIds, transitions, lenVec):
    # CRF loss for sentence i: negative log-likelihood of the gold tag path;
    # `n_tags` and `forward` come from the enclosing scope
    Len = lenVec[i]
    accLen = lenVec[:i].sum()
    currentTagsScores = scores[accLen:accLen + Len]
    currentIds = trueIds[accLen:accLen + Len]
    real_path_score = currentTagsScores[T.arange(Len), currentIds].sum()
    # Score from transitions
    padded_tags_ids = T.concatenate([[n_tags], currentIds], axis=0)
    real_path_score += transitions[
        padded_tags_ids[T.arange(Len)],
        padded_tags_ids[T.arange(Len) + 1]].sum()

    all_paths_scores = forward(currentTagsScores, transitions)
    cost = -(real_path_score - all_paths_scores)
    return cost
Example #9
def recurrence_predict(prev, obs):
    # runs forward() in Viterbi mode to get the best tag sequence; the decoded
    # ids are zero-padded back to a fixed length
    tags_score = obs[0]
    word_pos_ = obs[1] + 1
    tags_score_slice = tags_score[0:word_pos_, :]
    s_len = tf.shape(tags_score_slice)[0]
    obvs = tf.concat(
        [tags_score_slice, small * tf.ones((s_len, 2))], axis=1)
    observations = tf.concat([b_s, obvs, e_s], axis=0)
    all_paths_scores = forward(observations,
                               transitions,
                               viterbi=True,
                               return_alpha=False,
                               return_best_sequence=True)
    all_paths_scores = tf.concat(
        [all_paths_scores,
         tf.zeros([tf.shape(tags_score)[0] - s_len], tf.int32)],
        axis=0)
    return all_paths_scores
Example #10
    def build(self,
              dropout,
              char_dim,
              char_lstm_dim,
              char_bidirect,
              word_dim,
              word_lstm_dim,
              word_bidirect,
              lr_method,
              pre_emb,
              crf,
              cap_dim,
              training=True,
              word_to_id=None,
              **kwargs):
        """
        Build the network.
        """
        # Training parameters
        n_words = len(self.id_to_word)
        n_chars = len(self.id_to_char)
        n_tags = len(self.id_to_tag)

        # Number of capitalization features
        if cap_dim:
            n_cap = 6

        if self.parameters['pos_dim']:
            n_pos = len(self.id_to_pos)
        if self.parameters['ortho_dim']:
            n_ortho = len(self.id_to_ortho)
        if self.parameters['multi_task']:
            n_segment_tags = len(self.id_to_segment)
        if self.parameters['pre_emb_1_dim']:
            n_words_1 = len(self.id_to_word_1)

        # Network variables
        is_train = T.iscalar('is_train')
        word_ids = T.ivector(name='word_ids')
        char_for_ids = T.imatrix(name='char_for_ids')
        char_rev_ids = T.imatrix(name='char_rev_ids')
        char_pos_ids = T.ivector(name='char_pos_ids')
        tag_ids = T.ivector(name='tag_ids')
        if cap_dim:
            cap_ids = T.ivector(name='cap_ids')
        if self.parameters['pos_dim']:
            pos_ids = T.ivector(name='pos_ids')
        if self.parameters['ortho_dim']:
            ortho_ids = T.ivector(name='ortho_ids')
        if self.parameters['multi_task']:
            segment_tags_ids = T.ivector(name='segment_tags_ids')
        if self.parameters['pre_emb_1_dim']:
            word_ids_1 = T.ivector(name='doc_ids_dn')
        if self.parameters['language_model']:
            y_fwd_ids = T.ivector(name='y_fwd_ids')
            y_bwd_ids = T.ivector(name='y_bwd_ids')

        # Sentence length
        s_len = (word_ids if word_dim else char_pos_ids).shape[0]

        # Final input (all word features)
        input_dim = 0
        inputs = []

        #
        # Word inputs
        #
        if word_dim:
            input_dim += word_dim
            print('word_dim: {}'.format(word_dim))
            word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer')
            word_input = word_layer.link(word_ids)
            inputs.append(word_input)
            # Initialize with pretrained embeddings
            if pre_emb and training and not self.parameters['reload']:
                new_weights = word_layer.embeddings.get_value()
                print(
                    'Loading pretrained embeddings from {}...'.format(pre_emb))
                pretrained = {}
                emb_invalid = 0
                for i, line in enumerate(codecs.open(pre_emb, 'r', 'utf-8')):
                    line = line.rstrip().split()
                    if len(line) == word_dim + 1:
                        pretrained[line[0]] = np.array(
                            [float(x) for x in line[1:]]).astype(np.float32)
                    else:
                        emb_invalid += 1
                if emb_invalid > 0:
                    print('WARNING: {} invalid lines'.format(emb_invalid))
                c_found = 0
                c_lower = 0
                c_zeros = 0
                oov_words = 0
                if self.parameters['emb_of_unk_words']:
                    # TODO
                    # add path as a parameter
                    fast_text_model_p = '/home/ubuntu/usama_ws/resources/Spanish-Corporas/embeddings/fasttext/' \
                                        'fasttext-100d.bin'
                    ft_model = load_model(fast_text_model_p)
                # Lookup table initialization
                for i in range(n_words):
                    word = self.id_to_word[i]
                    if word in pretrained:
                        new_weights[i] = pretrained[word]
                        c_found += 1
                    elif word.lower() in pretrained:
                        new_weights[i] = pretrained[word.lower()]
                        c_lower += 1
                    elif re.sub(r'\d', '0', word.lower()) in pretrained:
                        new_weights[i] = pretrained[re.sub(
                            r'\d', '0', word.lower())]
                        c_zeros += 1
                    else:
                        if self.parameters['emb_of_unk_words']:
                            new_weights[i] = ft_model.get_word_vector(word)
                        oov_words += 1

                # set row corresponding to padding token to 0
                new_weights[word_to_id['<PADDING>']] = np.zeros(word_dim)

                word_layer.embeddings.set_value(new_weights)
                print('Loaded {} pretrained embeddings.'.format(
                    len(pretrained)))
                print('{} / {} ({} percent) words have been initialized with '
                      'pretrained embeddings.'.format(
                          c_found + c_lower + c_zeros, n_words,
                          100. * (c_found + c_lower + c_zeros) / n_words))
                print('{} found directly, {} after lowercasing, '
                      '{} after lowercasing + zero.'.format(
                          c_found, c_lower, c_zeros))
                print('oov words count: {}'.format(oov_words))

        #
        # Additional word inputs (second pretrained embedding table, pre_emb_1)
        #
        if self.parameters['pre_emb_1']:
            print('pre_emb_1_dim: {}'.format(self.parameters['pre_emb_1_dim']))
            input_dim += self.parameters['pre_emb_1_dim']
            # the embedding size must match pre_emb_1_dim (added to input_dim above),
            # not word_dim
            word_layer_1 = EmbeddingLayer(n_words_1,
                                          self.parameters['pre_emb_1_dim'],
                                          name='word_layer_1')
            word_input_1 = word_layer_1.link(word_ids_1)
            inputs.append(word_input_1)
            if training and not self.parameters['reload']:
                # Initialize with pretrained embeddings
                new_weights_1 = word_layer_1.embeddings.get_value()
                print('Loading pretrained embeddings from {}...'.format(
                    self.parameters['pre_emb_1']))
                pretrained_1 = {}
                emb_invalid_1 = 0
                for i, line in enumerate(
                        codecs.open(self.parameters['pre_emb_1'], 'r',
                                    'utf-8')):
                    line = line.rstrip().split()
                    if len(line) == self.parameters['pre_emb_1_dim'] + 1:
                        pretrained_1[line[0]] = np.array(
                            [float(x) for x in line[1:]]).astype(np.float32)
                    else:
                        emb_invalid_1 += 1
                if emb_invalid_1 > 0:
                    print('WARNING: {} invalid lines'.format(emb_invalid_1))
                c_found = 0
                c_lower = 0
                c_zeros = 0
                oov_words = 0
                # Lookup table initialization
                for i in range(n_words_1):
                    word_1 = self.id_to_word_1[i]
                    if word_1 in pretrained_1:
                        new_weights_1[i] = pretrained_1[word_1]
                        c_found += 1
                    elif word_1.lower() in pretrained_1:
                        new_weights_1[i] = pretrained_1[word_1.lower()]
                        c_lower += 1
                    elif re.sub(r'\d', '0', word_1.lower()) in pretrained_1:
                        new_weights_1[i] = pretrained_1[re.sub(
                            r'\d', '0', word_1.lower())]
                        c_zeros += 1
                    else:
                        oov_words += 1

                word_layer_1.embeddings.set_value(new_weights_1)
                print('Loaded {} pretrained embeddings.'.format(
                    len(pretrained_1)))
                print('{} / {} ({} percent) words have been initialized with '
                      'pretrained embeddings.'.format(
                          c_found + c_lower + c_zeros, n_words_1,
                          100. * (c_found + c_lower + c_zeros) / n_words_1))
                print('{} found directly, {} after lowercasing, '
                      '{} after lowercasing + zero.'.format(
                          c_found, c_lower, c_zeros))
                print('oov words count: {}'.format(oov_words))

        #
        # Chars inputs
        #
        if char_dim:
            input_dim += char_lstm_dim
            char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer')

            char_lstm_for = LSTM(char_dim,
                                 char_lstm_dim,
                                 with_batch=True,
                                 name='char_lstm_for')
            char_lstm_rev = LSTM(char_dim,
                                 char_lstm_dim,
                                 with_batch=True,
                                 name='char_lstm_rev')

            char_lstm_for.link(char_layer.link(char_for_ids))
            char_lstm_rev.link(char_layer.link(char_rev_ids))

            char_for_output = char_lstm_for.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), char_pos_ids]
            char_rev_output = char_lstm_rev.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), char_pos_ids]

            inputs.append(char_for_output)
            if char_bidirect:
                inputs.append(char_rev_output)
                input_dim += char_lstm_dim

        #
        # Capitalization feature
        #
        if cap_dim:
            input_dim += cap_dim
            cap_layer = EmbeddingLayer(n_cap, cap_dim, name='cap_layer')
            inputs.append(cap_layer.link(cap_ids))
        if self.parameters['pos_dim']:
            input_dim += self.parameters['pos_dim']
            pos_layer = EmbeddingLayer(n_pos,
                                       self.parameters['pos_dim'],
                                       name='pos_layer')
            inputs.append(pos_layer.link(pos_ids))
            # zeroing the '<UNK>' pos tag row
            # loading reverse mappings
            pos_to_id = {y: x for x, y in self.id_to_pos.items()}
            unk_idx = pos_to_id['<UNK>']
            _pos_wts = pos_layer.embeddings.get_value()
            _pos_wts[unk_idx] = [0.] * self.parameters['pos_dim']
            pos_layer.embeddings.set_value(_pos_wts)
        if self.parameters['ortho_dim']:
            input_dim += self.parameters['ortho_dim']
            ortho_layer = EmbeddingLayer(n_ortho,
                                         self.parameters['ortho_dim'],
                                         name='ortho_layer')
            inputs.append(ortho_layer.link(ortho_ids))
            ortho_to_id = {y: x for x, y in self.id_to_ortho.items()}
            unk_idx = ortho_to_id['<UNK>']
            _pos_wts = ortho_layer.embeddings.get_value()
            _pos_wts[unk_idx] = [0.] * self.parameters['ortho_dim']
            ortho_layer.embeddings.set_value(_pos_wts)

        print('input_dim: {}'.format(input_dim))
        # Prepare final input
        inputs = T.concatenate(inputs,
                               axis=1) if len(inputs) != 1 else inputs[0]
        #
        # Dropout on final input
        #
        if dropout:
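            # non-inverted dropout: a mask is sampled at training time and activations
            # are scaled by (1 - dropout) at test time (assuming DropoutLayer drops
            # units with probability `dropout`)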
            dropout_layer = DropoutLayer(p=dropout)
            input_train = dropout_layer.link(inputs)
            input_test = (1 - dropout) * inputs
            inputs = T.switch(T.neq(is_train, 0), input_train, input_test)

        # LSTM for words
        word_lstm_for = LSTM(input_dim,
                             word_lstm_dim,
                             with_batch=False,
                             name='word_lstm_for')
        word_lstm_rev = LSTM(input_dim,
                             word_lstm_dim,
                             with_batch=False,
                             name='word_lstm_rev')
        word_lstm_for.link(inputs)
        word_lstm_rev.link(inputs[::-1, :])
        word_for_output = word_lstm_for.h
        word_rev_output = word_lstm_rev.h[::-1, :]
        if word_bidirect:
            n_h = 2 * word_lstm_dim
            final_output = T.concatenate([word_for_output, word_rev_output],
                                         axis=1)
            tanh_layer = HiddenLayer(n_h,
                                     word_lstm_dim,
                                     name='tanh_layer',
                                     activation='tanh')
            final_output = tanh_layer.link(final_output)
        else:
            final_output = word_for_output

        # Sentence to Named Entity tags - Score
        final_layer = HiddenLayer(word_lstm_dim,
                                  n_tags,
                                  name='final_layer',
                                  activation=(None if crf else 'softmax'))
        tags_scores = final_layer.link(final_output)
        if self.parameters['multi_task']:
            # Sentence to Named Entity Segmentation tags - Score
            segment_layer = HiddenLayer(
                word_lstm_dim,
                n_segment_tags,
                name='segment_layer',
                activation=(None if crf else 'softmax'))
            segment_tags_scores = segment_layer.link(final_output)

        # No CRF
        if not crf:
            cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean()
            if self.parameters['multi_task']:
                cost_segment = T.nnet.categorical_crossentropy(
                    segment_tags_scores, segment_tags_ids).mean()
                cost += cost_segment
        # CRF
        else:
            transitions = shared((n_tags + 2, n_tags + 2), 'transitions')

            small = -1000
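            # b_s / e_s are artificial begin/end observation rows (two extra tag columns
            # are appended below for them); the cost is the CRF negative log-likelihood:
            # gold-path score minus the all-paths score returned by forward()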
            b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32)
            e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32)
            observations = T.concatenate(
                [tags_scores, small * T.ones((s_len, 2))], axis=1)
            observations = T.concatenate([b_s, observations, e_s], axis=0)

            # Score from tags
            real_path_score = tags_scores[T.arange(s_len), tag_ids].sum()

            # Score from transitions
            b_id = theano.shared(value=np.array([n_tags], dtype=np.int32))
            e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32))
            padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0)
            real_path_score += transitions[padded_tags_ids[T.arange(s_len +
                                                                    1)],
                                           padded_tags_ids[T.arange(s_len + 1)
                                                           + 1]].sum()

            all_paths_scores = forward(observations, transitions)
            cost = -(real_path_score - all_paths_scores)

            if self.parameters['multi_task']:
                segment_transitions = shared(
                    (n_segment_tags + 2, n_segment_tags + 2),
                    'segment_transitions')

                seg_small = -1000
                seg_b_s = np.array([[seg_small] * n_segment_tags +
                                    [0, seg_small]]).astype(np.float32)
                seg_e_s = np.array([[seg_small] * n_segment_tags +
                                    [seg_small, 0]]).astype(np.float32)
                segment_observations = T.concatenate(
                    [segment_tags_scores, seg_small * T.ones((s_len, 2))],
                    axis=1)
                segment_observations = T.concatenate(
                    [seg_b_s, segment_observations, seg_e_s], axis=0)

                # Score from tags
                seg_real_path_score = segment_tags_scores[
                    T.arange(s_len), segment_tags_ids].sum()

                # Score from transitions
                seg_b_id = theano.shared(
                    value=np.array([n_segment_tags], dtype=np.int32))
                seg_e_id = theano.shared(
                    value=np.array([n_segment_tags + 1], dtype=np.int32))
                seg_padded_tags_ids = T.concatenate(
                    [seg_b_id, segment_tags_ids, seg_e_id], axis=0)
                seg_real_path_score += segment_transitions[
                    seg_padded_tags_ids[T.arange(s_len + 1)],
                    seg_padded_tags_ids[T.arange(s_len + 1) + 1]].sum()

                seg_all_paths_scores = forward(segment_observations,
                                               segment_transitions)
                cost_segment = -(seg_real_path_score - seg_all_paths_scores)
                cost += cost_segment

        if training and self.parameters['ranking_loss']:

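            # per-token ranking loss: a hinge margin of 1 between the score of the
            # gold tag and the best-scoring incorrect tag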
            def recurrence(x_t, y_t):
                token_prob_pos = x_t[y_t]
                arg_max_1 = T.argmax(x_t)
                arg_max_2 = T.argsort(-x_t)[1]
                token_prob_neg = ifelse(T.eq(y_t, arg_max_1), x_t[arg_max_2],
                                        x_t[arg_max_1])
                cost_t = T.max([0, 1.0 - token_prob_pos + token_prob_neg])
                return cost_t

            cost_r, _ = theano.scan(recurrence,
                                    sequences=[tags_scores, tag_ids])
            cum_cost = T.sum(cost_r)
            cost += cum_cost

        if self.parameters['language_model']:
            lm_fwd_layer = HiddenLayer(word_lstm_dim,
                                       n_words,
                                       name='lm_fwd_layer',
                                       activation='softmax')
            lm_fwd_scores = lm_fwd_layer.link(final_output)
            lm_fwd_cost = T.nnet.categorical_crossentropy(
                lm_fwd_scores, y_fwd_ids).mean()
            lm_bwd_layer = HiddenLayer(word_lstm_dim,
                                       n_words,
                                       name='lm_bwd_layer',
                                       activation='softmax')
            lm_bwd_scores = lm_bwd_layer.link(final_output)
            lm_bwd_cost = T.nnet.categorical_crossentropy(
                lm_bwd_scores, y_bwd_ids).mean()
            cost_lm = lm_fwd_cost + lm_bwd_cost
            cost += cost_lm

        # Network parameters
        params = []
        if word_dim:
            self.add_component(word_layer)
            params.extend(word_layer.params)
        if self.parameters['pre_emb_1']:
            self.add_component(word_layer_1)
            params.extend(word_layer_1.params)
        if char_dim:
            self.add_component(char_layer)
            self.add_component(char_lstm_for)
            params.extend(char_layer.params)
            params.extend(char_lstm_for.params)
            if char_bidirect:
                self.add_component(char_lstm_rev)
                params.extend(char_lstm_rev.params)
        self.add_component(word_lstm_for)
        params.extend(word_lstm_for.params)
        if word_bidirect:
            self.add_component(word_lstm_rev)
            params.extend(word_lstm_rev.params)
        if cap_dim:
            self.add_component(cap_layer)
            params.extend(cap_layer.params)
        if self.parameters['pos_dim']:
            self.add_component(pos_layer)
            params.extend(pos_layer.params)
        if self.parameters['ortho_dim']:
            self.add_component(ortho_layer)
            params.extend(ortho_layer.params)
        self.add_component(final_layer)
        params.extend(final_layer.params)
        if self.parameters['multi_task']:
            self.add_component(segment_layer)
            params.extend(segment_layer.params)
        if crf:
            self.add_component(transitions)
            params.append(transitions)
            if self.parameters['multi_task']:
                self.add_component(segment_transitions)
                params.append(segment_transitions)
        if word_bidirect:
            self.add_component(tanh_layer)
            params.extend(tanh_layer.params)
        if self.parameters['language_model']:
            self.add_component(lm_fwd_layer)
            params.extend(lm_fwd_layer.params)
            self.add_component(lm_bwd_layer)
            params.extend(lm_bwd_layer.params)

        # Prepare train and eval inputs
        eval_inputs = []
        if word_dim:
            eval_inputs.append(word_ids)
        if char_dim:
            eval_inputs.append(char_for_ids)
            if char_bidirect:
                eval_inputs.append(char_rev_ids)
            eval_inputs.append(char_pos_ids)
        if cap_dim:
            eval_inputs.append(cap_ids)
        if self.parameters['pos_dim']:
            eval_inputs.append(pos_ids)
        if self.parameters['ortho_dim']:
            eval_inputs.append(ortho_ids)
        if self.parameters['pre_emb_1']:
            eval_inputs.append(word_ids_1)
        train_inputs = eval_inputs + [tag_ids]
        if self.parameters['multi_task']:
            train_inputs += [segment_tags_ids]
        if self.parameters['language_model']:
            train_inputs.append(y_fwd_ids)
            train_inputs.append(y_bwd_ids)

        # Parse optimization method parameters
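        # e.g. "sgd-lr_0.01" is parsed as method "sgd" with parameters {"lr": 0.01}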
        if "-" in lr_method:
            lr_method_name = lr_method[:lr_method.find('-')]
            lr_method_parameters = {}
            for x in lr_method[lr_method.find('-') + 1:].split('-'):
                split = x.split('_')
                assert len(split) == 2
                lr_method_parameters[split[0]] = float(split[1])
        else:
            lr_method_name = lr_method
            lr_method_parameters = {}

        # Compile training function
        print('Compiling...')
        if training:
            updates = Optimization(clip=5.0).get_updates(
                lr_method_name, cost, params, **lr_method_parameters)
            f_train = theano.function(inputs=train_inputs,
                                      outputs=cost,
                                      updates=updates,
                                      givens=({
                                          is_train: np.cast['int32'](1)
                                      } if dropout else {}),
                                      allow_input_downcast=True)
        else:
            f_train = None

        # Compile evaluation function
        if not crf:
            f_eval = theano.function(inputs=eval_inputs,
                                     outputs=tags_scores,
                                     givens=({
                                         is_train: np.cast['int32'](0)
                                     } if dropout else {}),
                                     allow_input_downcast=True)
        else:
            f_eval = theano.function(inputs=eval_inputs,
                                     outputs=forward(
                                         observations,
                                         transitions,
                                         viterbi=True,
                                         return_alpha=False,
                                         return_best_sequence=True),
                                     givens=({
                                         is_train: np.cast['int32'](0)
                                     } if dropout else {}),
                                     allow_input_downcast=True)

        return f_train, f_eval
Example #11
    def build(self,
              dropout,
              char_dim,
              char_lstm_dim,
              char_bidirect,
              word_dim,
              word_lstm_dim,
              word_bidirect,
              lr_method,
              pre_emb,
              crf,
              cap_dim,
              model_type,
              training=True,
              **kwargs):
        """
        Build the network.
        """
        # Training parameters
        layer_weighting = "fixed"
        n_words = len(self.id_to_word)
        n_chars = len(self.id_to_char)

        print "-------------------------------MODEL INFO---------------------------------------"
        print "** model_type", model_type
        print "** n_words, n_chars:", n_words, n_chars
        print "** self.feature_maps:"
        for f in self.feature_maps:
            print f["name"], f
        print "** self.tag_maps:"
        for tm in self.tag_maps:
            print tm
        print "---------------------------------------------------------------------------------"

        # Number of capitalization features
        if cap_dim:
            n_cap = 4

        # Network variables
        is_train = T.iscalar('is_train')
        word_ids = T.ivector(name='word_ids')

        char_for_ids = T.imatrix(name='char_for_ids')
        char_rev_ids = T.imatrix(name='char_rev_ids')
        char_pos_ids = T.ivector(name='char_pos_ids')

        if cap_dim:
            cap_ids = T.ivector(name='cap_ids')

        features_ids = []
        for f in self.feature_maps:
            features_ids.append(T.ivector(name=f['name'] + '_ids'))

        # Sentence length
        s_len = (word_ids if word_dim else char_pos_ids).shape[0]

        # Final input (all word features)
        input_dim = 0
        inputs = []

        #
        # Word inputs
        #
        if word_dim:
            input_dim += word_dim
            print "** input_dim (input_dim += word_dim)", input_dim
            word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer')
            word_input = word_layer.link(word_ids)
            inputs.append(word_input)
            # Initialize with pretrained embeddings
            if pre_emb and training:
                new_weights = word_layer.embeddings.get_value()
                print 'Loading pretrained embeddings from %s...' % pre_emb
                pretrained = {}
                emb_invalid = 0
                for i, line in enumerate(codecs.open(pre_emb, 'r', 'utf-8')):
                    line = line.rstrip().split()
                    if len(line) == word_dim + 1:
                        pretrained[line[0]] = np.array(
                            [float(x) for x in line[1:]]).astype(np.float32)
                    else:
                        emb_invalid += 1
                if emb_invalid > 0:
                    print 'WARNING: %i invalid lines' % emb_invalid
                c_found = 0
                c_lower = 0
                c_zeros = 0
                # Lookup table initialization
                for i in xrange(n_words):
                    word = self.id_to_word[i]
                    if word in pretrained:
                        new_weights[i] = pretrained[word]
                        c_found += 1
                    elif word.lower() in pretrained:
                        new_weights[i] = pretrained[word.lower()]
                        c_lower += 1
                    elif re.sub(r'\d', '0', word.lower()) in pretrained:
                        new_weights[i] = pretrained[re.sub(
                            r'\d', '0', word.lower())]
                        c_zeros += 1
                word_layer.embeddings.set_value(new_weights)
                print 'Loaded %i pretrained embeddings.' % len(pretrained)
                print(
                    '%i / %i (%.4f%%) words have been initialized with '
                    'pretrained embeddings.') % (
                        c_found + c_lower + c_zeros, n_words, 100. *
                        (c_found + c_lower + c_zeros) / n_words)
                print(
                    '%i found directly, %i after lowercasing, '
                    '%i after lowercasing + zero.') % (c_found, c_lower,
                                                       c_zeros)

        #
        # Chars inputs
        #
        if char_dim:
            input_dim += char_lstm_dim
            print "** input_dim (input_dim += char_lstm_dim)", input_dim

            char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer')

            char_lstm_for = LSTM(char_dim,
                                 char_lstm_dim,
                                 with_batch=True,
                                 name='char_lstm_for')
            char_lstm_rev = LSTM(char_dim,
                                 char_lstm_dim,
                                 with_batch=True,
                                 name='char_lstm_rev')

            char_lstm_for.link(char_layer.link(char_for_ids))
            char_lstm_rev.link(char_layer.link(char_rev_ids))

            char_for_output = char_lstm_for.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), char_pos_ids]
            char_rev_output = char_lstm_rev.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), char_pos_ids]

            inputs.append(char_for_output)
            if char_bidirect:
                inputs.append(char_rev_output)
                input_dim += char_lstm_dim
                print "** input_dim (input_dim += char_lstm_dim: char_bidirect)", input_dim

        #
        # Capitalization feature
        #
        if cap_dim:
            input_dim += cap_dim
            print "** input_dim (input_dim += cap_dim)", input_dim
            cap_layer = EmbeddingLayer(n_cap, cap_dim, name='cap_layer')
            inputs.append(cap_layer.link(cap_ids))

        f_layers = []
        for ilayer in range(len(self.feature_maps)):
            f = self.feature_maps[ilayer]
            input_dim += f['dim']
            print "** input_dim (input_dim += f['dim'])", input_dim

            af_layer = EmbeddingLayer(len(f['id_to_ftag']),
                                      f['dim'],
                                      name=f['name'] + '_layer')
            f_layers.append(af_layer)
            inputs.append(af_layer.link(features_ids[ilayer]))

        # Prepare final input
        inputs = T.concatenate(inputs, axis=1)
        # inputs_nodropout = inputs

        #
        # Dropout on final input
        #
        if dropout:
            dropout_layer = DropoutLayer(p=dropout)
            input_train = dropout_layer.link(inputs)
            input_test = (1 - dropout) * inputs
            inputs = T.switch(T.neq(is_train, 0), input_train, input_test)

        assert model_type in {
            "struct", "struct_mlp", "struct_mlp2", "multilayer", "single"
        }
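        # "single"/"multilayer" build one BiLSTM + output layer per tag map, feeding each
        # layer's tag scores into the next layer's input; "struct"/"struct_mlp*" share one
        # BiLSTM and chain per-tag-map output (and optional MLP) layers on top of it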

        # Network parameters: Part 1 (Common parameters)
        params = []
        if word_dim:
            self.add_component(word_layer)
            params.extend(word_layer.params)

        for af_layer in f_layers:
            self.add_component(af_layer)
            params.extend(af_layer.params)

        if char_dim:
            self.add_component(char_layer)
            self.add_component(char_lstm_for)
            params.extend(char_layer.params)
            params.extend(char_lstm_for.params)
            if char_bidirect:
                self.add_component(char_lstm_rev)
                params.extend(char_lstm_rev.params)

        if cap_dim:
            self.add_component(cap_layer)
            params.extend(cap_layer.params)

        if model_type == "multilayer" or model_type == "single":
            tags_scores_list = []
            tag_ids_list = []
            cost_list = []

            observations_list = []
            transitions_list = []

            prev_input_dim = input_dim
            prev_ntags = 0
            prev_tags_cores = None
            previous_inputs = inputs

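            # layer i sees the original word features plus the raw tag scores of
            # layer i-1; its own tag scores are chained into layer i+1 below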
            for ilayer in range(len(self.tag_maps)):
                inputs_i = previous_inputs if prev_tags_cores is None else T.concatenate(
                    [previous_inputs, prev_tags_cores], axis=1)
                previous_inputs = inputs_i
                input_dim_i = prev_input_dim + prev_ntags
                print "input_dim_i for layer %d: %d" % (ilayer, input_dim_i)

                word_lstm_for_i = LSTM(input_dim_i,
                                       word_lstm_dim,
                                       with_batch=False,
                                       name='word_lstm_for' + str(ilayer))
                word_lstm_rev_i = LSTM(input_dim_i,
                                       word_lstm_dim,
                                       with_batch=False,
                                       name='word_lstm_rev' + str(ilayer))
                word_lstm_for_i.link(inputs_i)
                word_lstm_rev_i.link(inputs_i[::-1, :])
                word_for_output_i = word_lstm_for_i.h
                word_rev_output_i = word_lstm_rev_i.h[::-1, :]

                if word_bidirect:
                    final_output_i = T.concatenate(
                        [word_for_output_i, word_rev_output_i], axis=1)
                    tanh_layer_i = HiddenLayer(2 * word_lstm_dim,
                                               word_lstm_dim,
                                               name='tanh_layer' + str(ilayer),
                                               activation='tanh')
                    final_output_i = tanh_layer_i.link(final_output_i)
                else:
                    final_output_i = word_for_output_i

                n_tags_i = len(self.tag_maps[ilayer]['id_to_tag'])

                final_layer_i = HiddenLayer(
                    word_lstm_dim,
                    n_tags_i,
                    name='final_layer' + str(ilayer),
                    activation=(None if crf else 'softmax'))
                tags_scores_i = final_layer_i.link(final_output_i)
                tag_ids_i = T.ivector(name='tag_ids' +
                                      str(ilayer))  # input tags of layer i

                # No CRF
                if not crf:
                    cost_i = T.nnet.categorical_crossentropy(
                        tags_scores_i, tag_ids_i).mean()
                # CRF
                else:
                    transitions_i = shared((n_tags_i + 2, n_tags_i + 2),
                                           'transitions' + str(ilayer))
                    small1 = -1000
                    b_s1 = np.array([[small1] * n_tags_i + [0, small1]
                                     ]).astype(np.float32)

                    e_s1 = np.array([[small1] * n_tags_i + [small1, 0]
                                     ]).astype(np.float32)

                    observations_i = T.concatenate(
                        [tags_scores_i, small1 * T.ones((s_len, 2))], axis=1)
                    observations_i = T.concatenate(
                        [b_s1, observations_i, e_s1], axis=0)

                    # Score from tags
                    real_path_score1 = tags_scores_i[T.arange(s_len),
                                                     tag_ids_i].sum()

                    # Score from transitions
                    b_id1 = theano.shared(
                        value=np.array([n_tags_i], dtype=np.int32))
                    e_id1 = theano.shared(
                        value=np.array([n_tags_i + 1], dtype=np.int32))
                    padded_tags_ids1 = T.concatenate([b_id1, tag_ids_i, e_id1],
                                                     axis=0)
                    real_path_score1 += transitions_i[
                        padded_tags_ids1[T.arange(s_len + 1)],
                        padded_tags_ids1[T.arange(s_len + 1) + 1]].sum()

                    all_paths_scores1 = forward(observations_i, transitions_i)

                    cost_i = -(real_path_score1 - all_paths_scores1)

                    observations_list.append(observations_i)
                    transitions_list.append(transitions_i)

                prev_input_dim = input_dim_i
                prev_ntags = n_tags_i
                prev_tags_cores = tags_scores_i * 1

                cost_list.append(cost_i)  # add cost of layer i into cost list
                tags_scores_list.append(tags_scores_i)
                tag_ids_list.append(tag_ids_i)

                # Network parameters: Part 2 (add parameters of mutilayer architectures)

                self.add_component(word_lstm_for_i)
                params.extend(word_lstm_for_i.params)  #1

                if word_bidirect:
                    self.add_component(word_lstm_rev_i)
                    params.extend(word_lstm_rev_i.params)  #2

                self.add_component(final_layer_i)
                params.extend(final_layer_i.params)  #3

                if crf:
                    self.add_component(transitions_i)
                    params.append(transitions_i)  #4

                if word_bidirect:
                    self.add_component(tanh_layer_i)
                    params.extend(tanh_layer_i.params)  #5

            # end for loop

        elif model_type == "struct" or model_type.startswith("struct_mlp"):
            # begin step 1: Using BI-LSTM to encode the sequence

            word_lstm_for = LSTM(input_dim,
                                 word_lstm_dim,
                                 with_batch=False,
                                 name='word_lstm_for')
            word_lstm_rev = LSTM(input_dim,
                                 word_lstm_dim,
                                 with_batch=False,
                                 name='word_lstm_rev')

            word_lstm_for.link(inputs)
            word_lstm_rev.link(inputs[::-1, :])
            word_for_output = word_lstm_for.h
            word_rev_output = word_lstm_rev.h[::-1, :]
            if word_bidirect:
                lstm_output = T.concatenate([word_for_output, word_rev_output],
                                            axis=1)
                tanh_layer = HiddenLayer(2 * word_lstm_dim,
                                         word_lstm_dim,
                                         name='tanh_layer',
                                         activation='tanh')
                lstm_output = tanh_layer.link(lstm_output)
            else:
                lstm_output = word_for_output

            # end step 1: final_output is the list of hidden states. Shapes of hidden state is

            prev_ntags = 0
            tags_scores_list = []
            prev_tags_cores = None
            final_layer_list = []
            final_output = lstm_output
            mlp_list = []

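            # "struct": plain output layers chained on the shared BiLSTM output;
            # "struct_mlp"/"struct_mlp2": one or two tanh MLP layers before each output layer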
            if model_type == "struct":
                for ilayer in range(0, len(self.tag_maps)):
                    n_tags_i = len(self.tag_maps[ilayer]['id_to_tag'])
                    final_output = final_output if prev_tags_cores is None else T.concatenate(
                        [final_output, prev_tags_cores], axis=1)
                    final_layer_i = HiddenLayer(
                        word_lstm_dim + prev_ntags,
                        n_tags_i,
                        name='final_layer_' + str(ilayer),
                        activation=(None if crf else 'softmax'))
                    tags_scores_i = final_layer_i.link(final_output)

                    prev_ntags += n_tags_i
                    prev_tags_cores = tags_scores_i
                    tags_scores_list.append(tags_scores_i)
                    final_layer_list.append(final_layer_i)
            elif model_type.startswith("struct_mlp"):

                for ilayer in range(0, len(self.tag_maps)):
                    n_tags_i = len(self.tag_maps[ilayer]['id_to_tag'])
                    final_output = final_output if prev_tags_cores is None else T.concatenate(
                        [final_output, prev_tags_cores], axis=1)

                    if model_type == "struct_mlp2":
                        mlp_sizes = [
                            word_lstm_dim + prev_ntags, word_lstm_dim,
                            word_lstm_dim
                        ]
                    else:
                        mlp_sizes = [word_lstm_dim + prev_ntags, word_lstm_dim]

                    mlp_input = final_output
                    for j in range(len(mlp_sizes) - 1):
                        mlp_layer = HiddenLayer(mlp_sizes[j],
                                                mlp_sizes[j + 1],
                                                name="mlp" + str(j + 1) +
                                                "_layer_" + str(ilayer),
                                                activation="tanh")
                        mlp_input = mlp_layer.link(mlp_input)
                        mlp_list.append(mlp_layer)
                    final_layer_i = HiddenLayer(
                        word_lstm_dim,
                        n_tags_i,
                        name='final_layer_' + str(ilayer),
                        activation=(None if crf else 'softmax'))
                    tags_scores_i = final_layer_i.link(mlp_input)

                    # # unroll version
                    # mlp1_layer_i = HiddenLayer(word_lstm_dim + prev_ntags, word_lstm_dim,
                    #                            name="mlp1_layer_" + str(ilayer), activation="tanh")
                    # mlp1_layer_i_out = mlp1_layer_i.link(final_output)
                    #
                    # mlp2_layer_i = HiddenLayer(word_lstm_dim, word_lstm_dim,
                    #                            name="mlp2_layer_" + str(ilayer), activation="tanh")
                    # mlp2_layer_i_out = mlp2_layer_i.link(mlp1_layer_i_out)
                    # mlp_list.append(mlp1_layer_i)
                    # mlp_list.append(mlp2_layer_i)
                    #
                    # final_layer_i = HiddenLayer(word_lstm_dim, n_tags_i, name='final_layer_' + str(ilayer),
                    #                             activation=(None if crf else 'softmax'))
                    # tags_scores_i = final_layer_i.link(mlp2_layer_i_out)

                    prev_ntags += n_tags_i
                    prev_tags_cores = tags_scores_i
                    tags_scores_list.append(tags_scores_i)
                    final_layer_list.append(final_layer_i)
            else:
                raise ValueError("unknown model_type: %s" % model_type)

            # # unroll code
            # n_tags_0 = len(self.tag_maps[0]['id_to_tag'])
            # final_layer_0 = HiddenLayer(word_lstm_dim, n_tags_0, name='final_layer_0', activation=(None if crf else 'softmax'))
            # tags_scores_0 = final_layer_0.link(final_output)
            #
            # n_tags_1 = len(self.tag_maps[1]['id_to_tag'])
            # final_layer_1 = HiddenLayer(word_lstm_dim + n_tags_0, n_tags_1, name='final_layer_1', activation=(None if crf else 'softmax'))
            # final_output = T.concatenate( [final_output, tags_scores_0], axis=1 )
            # tags_scores_1 = final_layer_1.link(final_output)
            #
            # n_tags_2 = len(self.tag_maps[2]['id_to_tag'])
            # final_layer_2 = HiddenLayer(word_lstm_dim + n_tags_0 + n_tags_1, n_tags_2, name='final_layer_2',
            #                         activation=(None if crf else 'softmax'))
            # final_output = T.concatenate([final_output, tags_scores_1], axis=1)
            # tags_scores_2 = final_layer_2.link(final_output)
            # tags_scores_list = [tags_scores_0, tags_scores_1, tags_scores_2]

            tag_ids_list = []
            observations_list = []
            transitions_list = []
            cost_list = []

            for ilayer in range(0, len(self.tag_maps)):
                tag_ids_i = T.ivector(name='tag_ids' +
                                      str(ilayer))  # input tags
                tag_ids_list.append(tag_ids_i)
                tags_scores_i = tags_scores_list[ilayer]
                n_tags_i = len(self.tag_maps[ilayer]['id_to_tag'])
                # No CRF
                if not crf:
                    cost_i = T.nnet.categorical_crossentropy(
                        tags_scores_i, tag_ids_i).mean()
                # CRF
                else:
                    transitions_i = shared((n_tags_i + 2, n_tags_i + 2),
                                           'transitions' + str(ilayer))
                    small1 = -1000
                    b_s1 = np.array([[small1] * n_tags_i + [0, small1]
                                     ]).astype(np.float32)
                    e_s1 = np.array([[small1] * n_tags_i + [small1, 0]
                                     ]).astype(np.float32)
                    observations_i = T.concatenate(
                        [tags_scores_i, small1 * T.ones((s_len, 2))], axis=1)
                    observations_i = T.concatenate(
                        [b_s1, observations_i, e_s1], axis=0)

                    # Score from tags
                    real_path_score1 = tags_scores_i[T.arange(s_len),
                                                     tag_ids_i].sum()

                    # Score from transitions
                    b_id1 = theano.shared(
                        value=np.array([n_tags_i], dtype=np.int32))
                    e_id1 = theano.shared(
                        value=np.array([n_tags_i + 1], dtype=np.int32))
                    padded_tags_ids1 = T.concatenate([b_id1, tag_ids_i, e_id1],
                                                     axis=0)
                    real_path_score1 += transitions_i[
                        padded_tags_ids1[T.arange(s_len + 1)],
                        padded_tags_ids1[T.arange(s_len + 1) + 1]].sum()

                    all_paths_scores1 = forward(observations_i, transitions_i)

                    cost_i = -(real_path_score1 - all_paths_scores1)

                    observations_list.append(observations_i)
                    transitions_list.append(transitions_i)

                cost_list.append(cost_i)  # add cost of layer i into cost list

            # Network parameters: Part 2 (add parameters of struct architectures)

            self.add_component(word_lstm_for)
            params.extend(word_lstm_for.params)

            if word_bidirect:
                self.add_component(word_lstm_rev)
                params.extend(word_lstm_rev.params)

            for mlp_layer in mlp_list:
                self.add_component(mlp_layer)
                params.extend(mlp_layer.params)

            for final_layer in final_layer_list:
                self.add_component(final_layer)
                params.extend(final_layer.params)

            # # unroll code
            # self.add_component(final_layer_0)
            # params.extend(final_layer_0.params)
            #
            # self.add_component(final_layer_1)
            # params.extend(final_layer_1.params)
            #
            # self.add_component(final_layer_2)
            # params.extend(final_layer_2.params)

            if crf:
                for transitions in transitions_list:
                    self.add_component(transitions)
                    params.append(transitions)

            if word_bidirect:
                self.add_component(tanh_layer)
                params.extend(tanh_layer.params)

        # elif model_type == "multilayer_original":
        #     print "** input_dim FOR LAYER 0 ", input_dim
        #     # LSTM for words
        #     word_lstm_for = LSTM(input_dim, word_lstm_dim, with_batch=False,
        #                          name='word_lstm_for')
        #     word_lstm_rev = LSTM(input_dim, word_lstm_dim, with_batch=False,
        #                          name='word_lstm_rev')
        #
        #     word_lstm_for.link(inputs)
        #     word_lstm_rev.link(inputs[::-1, :])
        #     word_for_output = word_lstm_for.h
        #     word_rev_output = word_lstm_rev.h[::-1, :]
        #     if word_bidirect:
        #         final_output = T.concatenate(
        #             [word_for_output, word_rev_output],
        #             axis=1
        #         )
        #         tanh_layer = HiddenLayer(2 * word_lstm_dim, word_lstm_dim,
        #                                  name='tanh_layer', activation='tanh')
        #         final_output = tanh_layer.link(final_output)
        #     else:
        #         final_output = word_for_output
        #
        #     # Sentence to Named Entity tags - Score
        #     n_tags = len(self.tag_maps[0]['id_to_tag'])
        #
        #     final_layer = HiddenLayer(word_lstm_dim, n_tags, name='final_layer',
        #                               activation=(None if crf else 'softmax'))
        #     tags_scores = final_layer.link(final_output)
        #     tag_ids = T.ivector(name='tag_ids0')  # input tags of layer i
        #
        #     # No CRF
        #     if not crf:
        #         cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean()
        #     # CRF
        #     else:
        #         transitions = shared((n_tags + 2, n_tags + 2), 'transitions')
        #
        #         small = -1000
        #         b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32)
        #         e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32)
        #         observations = T.concatenate(
        #             [tags_scores, small * T.ones((s_len, 2))],
        #             axis=1
        #         )
        #         observations = T.concatenate(
        #             [b_s, observations, e_s],
        #             axis=0
        #         )
        #
        #         # Score from tags
        #         real_path_score = tags_scores[T.arange(s_len), tag_ids].sum()
        #
        #         # Score from transitions
        #         b_id = theano.shared(value=np.array([n_tags], dtype=np.int32))
        #         e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32))
        #         padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0)
        #         real_path_score += transitions[
        #             padded_tags_ids[T.arange(s_len + 1)],
        #             padded_tags_ids[T.arange(s_len + 1) + 1]
        #         ].sum()
        #
        #         all_paths_scores = forward(observations, transitions)
        #         cost = - (real_path_score - all_paths_scores)
        #
        #     print "cost: ", cost
        #     # Network parameters
        #
        #
        #     self.add_component(word_lstm_for)
        #     params.extend(word_lstm_for.params)  #1
        #
        #     if word_bidirect:
        #         self.add_component(word_lstm_rev)
        #         params.extend(word_lstm_rev.params)  #2
        #
        #     self.add_component(final_layer)
        #     params.extend(final_layer.params)  #3
        #
        #     if crf:
        #         self.add_component(transitions)
        #         params.append(transitions)  #4
        #
        #     if word_bidirect:
        #         self.add_component(tanh_layer)
        #         params.extend(tanh_layer.params)  #5
        #
        #     #
        #     #    layer 1 to n
        #     #
        #     tags_scores_list = [tags_scores]
        #     tag_ids_list = [tag_ids]
        #     cost_list = [cost]
        #     observations_list = [observations]
        #     transitions_list = [transitions]
        #     prev_input_dim = input_dim
        #     prev_ntags = n_tags
        #     prev_tags_cores = tags_scores * 1
        #
        #     for ilayer in range(1, len(self.tag_maps)):
        #         inputs_i = previous_inputs * 1
        #         inputs_i.append(prev_tags_cores)
        #         previous_inputs = inputs_i * 1
        #
        #         inputs_i = T.concatenate(inputs_i, axis=1)
        #         input_dim_i = prev_input_dim + prev_ntags
        #
        #         word_lstm_for_i = LSTM(input_dim_i, word_lstm_dim, with_batch=False, name='word_lstm_for' + str(ilayer))
        #         word_lstm_rev_i = LSTM(input_dim_i, word_lstm_dim, with_batch=False, name='word_lstm_rev' + str(ilayer))
        #         word_lstm_for_i.link(inputs_i)
        #         word_lstm_rev_i.link(inputs_i[::-1, :])
        #         word_for_output_i = word_lstm_for_i.h
        #         word_rev_output_i = word_lstm_rev_i.h[::-1, :]
        #
        #         if word_bidirect:
        #             final_output_i = T.concatenate(
        #                 [word_for_output_i, word_rev_output_i],
        #                 axis=1
        #             )
        #             tanh_layer_i = HiddenLayer(2 * word_lstm_dim, word_lstm_dim,
        #                                        name='tanh_layer' + str(ilayer), activation='tanh')
        #             final_output_i = tanh_layer_i.link(final_output_i)
        #         else:
        #             final_output_i = word_for_output_i
        #
        #         n_tags_i = len(self.tag_maps[ilayer]['id_to_tag'])
        #
        #         final_layer_i = HiddenLayer(word_lstm_dim, n_tags_i, name='final_layer' + str(ilayer),
        #                                     activation=(None if crf else 'softmax'))
        #         tags_scores_i = final_layer_i.link(final_output_i)
        #         tags_scores_list.append(tags_scores_i)
        #         tag_ids_i = T.ivector(name='tag_ids' + str(ilayer))  # input tags
        #         tag_ids_list.append(tag_ids_i)
        #
        #         # No CRF
        #         if not crf:
        #             cost_i = T.nnet.categorical_crossentropy(tags_scores_i, tag_ids_i).mean()
        #         # CRF
        #         else:
        #             transitions_i = shared((n_tags_i + 2, n_tags_i + 2), 'transitions' + str(ilayer))
        #             small1 = -1000
        #             b_s1 = np.array([[small1] * n_tags_i + [0, small1]]).astype(np.float32)
        #             e_s1 = np.array([[small1] * n_tags_i + [small1, 0]]).astype(np.float32)
        #             observations_i = T.concatenate([tags_scores_i, small1 * T.ones((s_len, 2))], axis=1)
        #             observations_i = T.concatenate([b_s1, observations_i, e_s1], axis=0)
        #
        #             # Score from tags
        #             real_path_score1 = tags_scores_i[T.arange(s_len), tag_ids_i].sum()
        #
        #             # Score from transitions
        #             b_id1 = theano.shared(value=np.array([n_tags_i], dtype=np.int32))
        #             e_id1 = theano.shared(value=np.array([n_tags_i + 1], dtype=np.int32))
        #             padded_tags_ids1 = T.concatenate([b_id1, tag_ids_i, e_id1], axis=0)
        #             real_path_score1 += transitions_i[
        #                 padded_tags_ids1[T.arange(s_len + 1)],
        #                 padded_tags_ids1[T.arange(s_len + 1) + 1]
        #             ].sum()
        #
        #             all_paths_scores1 = forward(observations_i, transitions_i)
        #
        #             cost_i = - (real_path_score1 - all_paths_scores1)
        #
        #             observations_list.append(observations_i)
        #             transitions_list.append(transitions_i)
        #
        #         prev_input_dim = input_dim_i
        #         prev_ntags = n_tags_i
        #         prev_tags_cores = tags_scores_i * 1
        #         cost_list.append(cost_i)  # add cost of layer i into cost list
        #
        #         # add parameters
        #
        #         self.add_component(word_lstm_for_i)
        #         params.extend(word_lstm_for_i.params)
        #
        #         if word_bidirect:
        #             self.add_component(word_lstm_rev_i)
        #             params.extend(word_lstm_rev_i.params)
        #
        #         self.add_component(final_layer_i)
        #         params.extend(final_layer_i.params)
        #
        #         if crf:
        #             self.add_component(transitions_i)
        #             params.append(transitions_i)
        #
        #         if word_bidirect:
        #             self.add_component(tanh_layer_i)
        #             params.extend(tanh_layer_i.params)
        #
        #     # end for loop

        if layer_weighting == "fixed":
            if len(self.tag_maps) == 2:
                cost_weights = np.array([0.4, 0.6])
            elif len(self.tag_maps) == 3:
                cost_weights = np.array([0.4, 0.3, 0.3])
            else:
                cost_weights = np.ones(
                    (len(self.tag_maps), )) / len(self.tag_maps)
            costall = np.sum(cost_weights * np.array(cost_list))

        else:
            # https://groups.google.com/forum/#!topic/theano-users/XDG6MM83grI
            weights = np.ones((len(self.tag_maps), )) / len(self.tag_maps)
            cost_weights = theano.shared(weights.astype(theano.config.floatX),
                                         name="layer_weights")
            layer_weights = theano.tensor.nnet.sigmoid(cost_weights)
            params.extend([cost_weights])
            xx = theano.tensor.mul(layer_weights,
                                   theano.tensor.as_tensor_variable(cost_list))
            costall = theano.tensor.sum(xx)
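        # A worked example of the weighting above (numbers hypothetical): with two
        # tagging layers and cost_list = [1.2, 0.8], the "fixed" scheme gives
        # costall = 0.4 * 1.2 + 0.6 * 0.8 = 0.96, while the learned scheme mixes the
        # costs with sigmoid(cost_weights) so the relative weights can be trained.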

        # Prepare train and eval inputs
        eval_inputs = []

        if word_dim:
            eval_inputs.append(word_ids)

        for ilayer in range(len(self.feature_maps)):
            eval_inputs.append(features_ids[ilayer])

        if char_dim:
            eval_inputs.append(char_for_ids)
            if char_bidirect:
                eval_inputs.append(char_rev_ids)
            eval_inputs.append(char_pos_ids)

        if cap_dim:
            eval_inputs.append(cap_ids)

        train_inputs = eval_inputs + tag_ids_list

        print "-- train_inputs: ",
        print train_inputs  # [word_ids, pos_ids, chunk_ids, wh_ids, if_ids, s_ids, tag_ids, tag_ids1, tag_ids2]

        # Parse optimization method parameters
        if "-" in lr_method:
            lr_method_name = lr_method[:lr_method.find('-')]
            lr_method_parameters = {}
            for x in lr_method[lr_method.find('-') + 1:].split('-'):
                split = x.split('_')
                assert len(split) == 2
                lr_method_parameters[split[0]] = float(split[1])
        else:
            lr_method_name = lr_method
            lr_method_parameters = {}
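        # For example (format assumed from the parsing above), lr_method = "sgd-lr_0.005"
        # would yield lr_method_name = "sgd" and lr_method_parameters = {"lr": 0.005}.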

        # Compile training function
        print 'Compiling...'
        if training:
            # print "train_inputs[9]", train_inputs[9]
            print "-- len(cost_list): ", len(cost_list)
            updates = Optimization(clip=5.0).get_updates(
                lr_method_name, costall, params, **lr_method_parameters)
            f_train = theano.function(inputs=train_inputs,
                                      outputs=costall,
                                      updates=updates,
                                      givens=({
                                          is_train: np.cast['int32'](1)
                                      } if dropout else {}))

        else:
            f_train = None

        # Compile evaluation function
        tags_scores_out = tags_scores_list
        print "-- len(tags_scores_list): ", len(tags_scores_list)

        if not crf:
            f_eval = theano.function(
                inputs=eval_inputs,
                outputs=tags_scores_out,
                givens=({
                    is_train: np.cast['int32'](0)
                } if dropout else {})  #,
                # on_unused_input='ignore'
            )
        else:
            f_eval = theano.function(
                inputs=eval_inputs,
                outputs=forward_n(zip(observations_list, transitions_list),
                                  viterbi=True,
                                  return_alpha=False,
                                  return_best_sequence=True),
                givens=({
                    is_train: np.cast['int32'](0)
                } if dropout else {})  #,
                # on_unused_input='ignore'
            )

        from pprint import pprint
        print "--------------------------------------------------------------"
        pprint(self.components)

        return f_train, f_eval  # return f_train, f_eval, f_test
Пример #12
0
    def build(
            self,
            dropout,
            char_dim,
            char_hidden_dim,
            char_bidirect,
            word_dim,
            word_hidden_dim,
            word_bidirect,
            tagger_hidden_dim,
            hamming_cost,
            L2_reg,
            lr_method,
            pre_word_emb,
            pre_char_emb,
            tagger,
            use_gaze,
            POS,
            plot_cost,
            #cap_dim,
            training=True,
            **kwargs):
        """
        Build the network.
        """
        # Training parameters
        n_words = len(self.id_to_word)
        n_chars = len(self.id_to_char)
        n_tags = len(self.id_to_tag)
        # n_pos = len(self.id_to_pos) + 1

        # Number of capitalization features
        #if cap_dim:
        #    n_cap = 4

        # Network variables
        is_train = T.iscalar('is_train')  # declare an integer scalar variable is_train
        word_ids = T.ivector(name='word_ids')  # integer vector: one word id per token
        char_for_ids = T.imatrix(name='char_for_ids')  # integer matrix of forward char ids
        char_rev_ids = T.imatrix(name='char_rev_ids')
        char_pos_ids = T.ivector(name='char_pos_ids')
        if use_gaze:
            gaze = T.imatrix(name='gaze')
        if POS:
            # pos_ids = T.ivector(name='pos_ids')
            pos_one_hot = T.imatrix(name='pos_one_hot')
        #hamming_cost = T.matrix('hamming_cost', theano.config.floatX) # declare a 2-D matrix
        tag_ids = T.ivector(name='tag_ids')
        #if cap_dim:
        #    cap_ids = T.ivector(name='cap_ids')

        # Sentence length
        s_len = (word_ids if word_dim else char_pos_ids).shape[0]  # number of words in the sentence

        # Final input (all word features)
        input_dim = 0
        inputs = []
        L2_norm = 0.0

        theano.config.compute_test_value = 'off'
        #
        # Word inputs
        #
        if word_dim:
            input_dim += word_dim
            word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer')
            word_input = word_layer.link(word_ids)
            inputs.append(word_input)
            # Initialize with pretrained embeddings
            if pre_word_emb and training:
                new_weights = word_layer.embeddings.get_value()
                print 'Loading pretrained word embeddings from %s...' % pre_word_emb
                pretrained = {}
                emb_invalid = 0
                for i, line in enumerate(
                        codecs.open(pre_word_emb, 'r', 'utf-8', 'ignore')):
                    line = line.rstrip().split()
                    if len(line) == word_dim + 1:
                        pretrained[line[0]] = np.array(
                            [float(x) for x in line[1:]]).astype(np.float32)
                    else:
                        emb_invalid += 1
                if emb_invalid > 0:
                    print 'WARNING: %i invalid word embedding lines' % emb_invalid
                c_found = 0
                c_lower = 0
                c_zeros = 0
                # Lookup table initialization
                for i in xrange(n_words):
                    word = self.id_to_word[i]
                    if word in pretrained:
                        new_weights[i] = pretrained[word]
                        c_found += 1
                    elif word.lower() in pretrained:
                        new_weights[i] = pretrained[word.lower()]
                        c_lower += 1
                    elif re.sub('\d', '0', word) in pretrained:
                        new_weights[i] = pretrained[re.sub('\d', '0', word)]
                        c_zeros += 1
                word_layer.embeddings.set_value(new_weights)
                print 'Loaded %i pretrained word embeddings.' % len(pretrained)
                print(
                    '%i / %i (%.4f%%) words have been initialized with '
                    'pretrained word embeddings.') % (
                        c_found + c_lower + c_zeros, n_words, 100. *
                        (c_found + c_lower + c_zeros) / n_words)
                print('%i found directly, %i after lowercasing + zero.') % (
                    c_found, c_lower + c_zeros)
            L2_norm += (word_layer.embeddings**2).sum()

        #
        # Chars inputs
        #
        if char_dim:
            input_dim += char_hidden_dim
            char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer')
            char_for_input = char_layer.link(char_for_ids)
            char_rev_input = char_layer.link(char_rev_ids)

            # Initialize with pretrained char embeddings
            if pre_char_emb and training:
                new_weights = char_layer.embeddings.get_value()
                print 'Loading pretrained char embeddings from %s...' % pre_char_emb
                pretrained = {}
                emb_invalid = 0
                for i, line in enumerate(
                        codecs.open(pre_char_emb, 'r', 'utf-8', 'ignore')):
                    line = line.rstrip().split()
                    if len(line) == char_dim + 1:
                        pretrained[line[0]] = np.array(
                            [float(x) for x in line[1:]]).astype(np.float32)
                    else:
                        emb_invalid += 1
                if emb_invalid > 0:
                    print 'WARNING: %i invalid char embedding lines' % emb_invalid
                c_found = 0
                c_lower = 0
                c_zeros = 0
                # Lookup table initialization
                for i in xrange(n_chars):
                    char = self.id_to_char[i]
                    if char in pretrained:
                        new_weights[i] = pretrained[char]
                        c_found += 1
                    elif char.lower() in pretrained:
                        new_weights[i] = pretrained[char.lower()]
                        c_lower += 1
                    elif re.sub('\d', '0', char) in pretrained:
                        new_weights[i] = pretrained[re.sub('\d', '0', char)]
                        c_zeros += 1
                char_layer.embeddings.set_value(new_weights)
                print 'Loaded %i pretrained char embeddings.' % len(pretrained)
                print(
                    '%i / %i (%.4f%%) chars have been initialized with '
                    'pretrained char embeddings.') % (
                        c_found + c_lower + c_zeros, n_chars, 100. *
                        (c_found + c_lower + c_zeros) / n_chars)
                print('%i found directly, %i after lowercasing + zero.') % (
                    c_found, c_lower + c_zeros)
            L2_norm += (char_layer.embeddings**2).sum()

            char_lstm_for = LSTM(char_dim,
                                 char_hidden_dim,
                                 with_batch=True,
                                 name='char_lstm_for')
            char_lstm_rev = LSTM(char_dim,
                                 char_hidden_dim,
                                 with_batch=True,
                                 name='char_lstm_rev')

            char_lstm_for.link(char_for_input)
            char_lstm_rev.link(char_rev_input)

            char_for_output = char_lstm_for.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), char_pos_ids]
            char_rev_output = char_lstm_rev.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), char_pos_ids]

            for param in char_lstm_for.params[:8]:
                L2_norm += (param**2).sum()

            if char_bidirect:
                char_lstm_hidden = T.concatenate(
                    [char_for_output, char_rev_output], axis=1)
                input_dim += char_hidden_dim
                for param in char_lstm_rev.params[:8]:
                    L2_norm += (param**2).sum()

            else:
                char_lstm_hidden = char_for_output

            inputs.append(char_lstm_hidden)

        # if POS:
        # pos_dim = 20
        # input_dim += pos_dim
        # pos_layer = EmbeddingLayer(n_pos, pos_dim, name='pos_layer')
        # pos_input = pos_layer.link(pos_ids)
        # inputs.append(pos_input)
        # L2_norm += (pos_layer.embeddings ** 2).sum()

        #if len(inputs) != 1:
        inputs = T.concatenate(inputs, axis=1)

        #
        # Dropout on final input
        #
        if dropout:
            dropout_layer = DropoutLayer(p=dropout)
            input_train = dropout_layer.link(inputs)
            input_test = (1 - dropout) * inputs
            inputs = T.switch(T.neq(is_train, 0), input_train,
                              input_test)  # pick the dropped-out input at train time, the rescaled input at test time

        # if POS:
        #     inputs = T.concatenate([inputs, pos_one_hot], axis= 1)
        #     input_dim += 6

        # LSTM for words
        word_lstm_for = LSTM(input_dim,
                             word_hidden_dim,
                             with_batch=False,
                             name='word_lstm_for')
        word_lstm_rev = LSTM(input_dim,
                             word_hidden_dim,
                             with_batch=False,
                             name='word_lstm_rev')
        word_lstm_for.link(inputs)  # word order: I like dog
        word_lstm_rev.link(inputs[::-1, :])  # word order: dog like I
        word_for_output = word_lstm_for.h
        word_rev_output = word_lstm_rev.h[::-1, :]

        for param in word_lstm_for.params[:8]:
            L2_norm += (param**2).sum()

        if word_bidirect:
            final_output = T.concatenate([word_for_output, word_rev_output],
                                         axis=1)

            tanh_layer = HiddenLayer(2 * word_hidden_dim,
                                     word_hidden_dim,
                                     name='tanh_layer',
                                     activation='tanh')
            final_output = tanh_layer.link(final_output)
            for param in word_lstm_rev.params[:8]:
                L2_norm += (param**2).sum()

        else:
            final_output = word_for_output

        dims = word_hidden_dim
        if use_gaze:
            final_output = T.concatenate([final_output, gaze], axis=1)
            dims = word_hidden_dim + n_tags

        if POS:
            final_output = T.concatenate([final_output, pos_one_hot], axis=1)
            dims += 6

        # if word_bidirect:
        #     final_output = T.concatenate(
        #         [word_for_output, word_rev_output],
        #         axis=1
        #     )
        #     tanh_layer = HiddenLayer(2 * word_hidden_dim, word_hidden_dim,
        #                              name='tanh_layer', activation='tanh')
        #     final_output = tanh_layer.link(final_output)
        # else:
        #     final_output = word_for_output

        # Sentence to Named Entity tags
        ## final_layer = HiddenLayer(dims, n_tags, name='final_layer',
        ##                           activation=(None if crf else 'softmax'))
        # final_layer = HiddenLayer(word_hidden_dim, n_tags, name='final_layer',
        #                           activation=(None if crf else 'softmax'))
        ## tags_scores = final_layer.link(final_output)
        ## L2_norm += (final_layer.params[0] ** 2).sum()

        # Optional LSTM tagger layer before the final projection
        if tagger == 'lstm':
            tagger_layer = LSTM_d(dims,
                                  tagger_hidden_dim,
                                  with_batch=False,
                                  name='LSTM_d')
            tagger_layer.link(final_output)
            final_output = tagger_layer.t

            dims = tagger_hidden_dim

            for param in tagger_layer.params[:8]:
                L2_norm += (param**2).sum()

        final_layer = HiddenLayer(
            dims,
            n_tags,
            name='final_layer',
            activation=(None if tagger == 'crf' else 'softmax'))
        tags_scores = final_layer.link(final_output)
        L2_norm += (final_layer.params[0]**2).sum()

        if tagger != 'crf':
            cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean()
        # CRF
        else:
            transitions = shared((n_tags + 2, n_tags + 2), 'transitions')
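            # n_tags + 2: two extra states for the artificial begin and end tags used below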

            small = -1000
            b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32)
            e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32)
            observations = T.concatenate(
                [tags_scores, small * T.ones((s_len, 2))], axis=1)
            observations = T.concatenate([b_s, observations, e_s], axis=0)

            # Score from tags
            real_path_score = tags_scores[T.arange(s_len),
                                          tag_ids].sum()  # sum of the corresponding entries of the emission matrix P

            # Score from transitions
            b_id = theano.shared(value=np.array([n_tags], dtype=np.int32))
            e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32))
            padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0)
            real_path_score += transitions[
                padded_tags_ids[T.arange(s_len + 1)],
                padded_tags_ids[T.arange(s_len + 1) + 1]].sum()  # sum of the corresponding entries of the transition matrix A
            all_paths_scores = forward(observations,
                                       transitions,
                                       hamming_cost=hamming_cost,
                                       n_tags=n_tags,
                                       padded_tags_ids=padded_tags_ids)
            L2_norm += (transitions**2).sum()
            cost = -(real_path_score - all_paths_scores) + L2_reg * L2_norm

        # Network parameters
        params = []
        if word_dim:
            self.add_component(word_layer)
            params.extend(word_layer.params)
        if char_dim:
            self.add_component(char_layer)
            params.extend(char_layer.params)
            self.add_component(char_lstm_for)
            params.extend(char_lstm_for.params)

            if char_bidirect:
                self.add_component(char_lstm_rev)
                params.extend(char_lstm_rev.params)

        self.add_component(word_lstm_for)
        params.extend(word_lstm_for.params)

        # if POS:
        #     self.add_component(pos_layer)
        #     params.extend(pos_layer.params)

        if word_bidirect:
            self.add_component(word_lstm_rev)
            params.extend(word_lstm_rev.params)

        self.add_component(final_layer)
        params.extend(final_layer.params)

        if tagger == 'lstm':
            self.add_component(tagger_layer)
            params.extend(tagger_layer.params)
        elif tagger == 'crf':
            self.add_component(transitions)
            params.append(transitions)

        if word_bidirect:
            self.add_component(tanh_layer)
            params.extend(tanh_layer.params)

        # Prepare train and eval inputs
        eval_inputs = []
        if word_dim:
            eval_inputs.append(word_ids)
        if use_gaze:
            eval_inputs.append(gaze)
        if POS:
            # eval_inputs.append(pos_ids)
            eval_inputs.append(pos_one_hot)
        if char_dim:
            eval_inputs.append(char_for_ids)
            if char_bidirect:
                eval_inputs.append(char_rev_ids)
            eval_inputs.append(char_pos_ids)
        #if cap_dim:
        #    eval_inputs.append(cap_ids)
        train_inputs = eval_inputs + [tag_ids]

        # Parse optimization method parameters
        if "-" in lr_method:
            lr_method_name = lr_method[:lr_method.find('-')]
            lr_method_parameters = {}
            for x in lr_method[lr_method.find('-') + 1:].split('-'):
                split = x.split('_')
                assert len(split) == 2
                lr_method_parameters[split[0]] = float(split[1])
        else:
            lr_method_name = lr_method
            lr_method_parameters = {}

        # Compile training function
        print 'Compiling...'
        if training:
            updates = Optimization(clip=5.0).get_updates(
                lr_method_name, cost, params, **lr_method_parameters)
            f_train = theano.function(inputs=train_inputs,
                                      outputs=cost,
                                      updates=updates,
                                      givens=({
                                          is_train: np.cast['int32'](1)
                                      } if dropout else {}),
                                      on_unused_input='warn')
        else:
            f_train = None

        if plot_cost:
            f_plot_cost = theano.function(inputs=train_inputs,
                                          outputs=cost,
                                          givens=({
                                              is_train: np.cast['int32'](1)
                                          } if dropout else {}),
                                          on_unused_input='warn')
        else:
            f_plot_cost = None

        # Compile evaluation function
        if tagger != 'crf':
            f_eval = theano.function(inputs=eval_inputs,
                                     outputs=tags_scores,
                                     givens=({
                                         is_train: np.cast['int32'](0)
                                     } if dropout else {}),
                                     on_unused_input='warn')
        else:
            f_eval = theano.function(inputs=eval_inputs,
                                     outputs=forward(
                                         observations,
                                         transitions,
                                         hamming_cost=0,
                                         n_tags=None,
                                         padded_tags_ids=None,
                                         viterbi=True,
                                         return_alpha=False,
                                         return_best_sequence=True),
                                     givens=({
                                         is_train: np.cast['int32'](0)
                                     } if dropout else {}),
                                     on_unused_input='warn')

        return f_train, f_eval, f_plot_cost
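
# A minimal numpy sketch (hypothetical, for illustration only) of the log-sum-exp
# forward recursion that a forward(observations, transitions) helper like the one
# used above typically computes for a linear-chain CRF; the real helper also
# supports Viterbi decoding and, in this example, an optional hamming cost.
import numpy as np

def logsumexp(x, axis=None):
    m = np.max(x, axis=axis, keepdims=True)
    return np.squeeze(m + np.log(np.sum(np.exp(x - m), axis=axis, keepdims=True)),
                      axis=axis)

def crf_all_paths_score(observations, transitions):
    # observations: (n_steps, n_labels) emission scores, already padded with the
    # begin/end rows as above; transitions: (n_labels, n_labels) transition scores.
    alpha = observations[0]
    for obs in observations[1:]:
        # alpha[i] + transitions[i, j] + obs[j], log-summed over the previous label i
        alpha = logsumexp(alpha[:, None] + transitions, axis=0) + obs
    return logsumexp(alpha)  # log of the summed exponentiated scores over all paths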
Пример #13
0
import numpy as np
import ipdb
from util import *
import nn

if __name__ == '__main__':
    params = {}

    #To test ff
    X = np.array([[1, 2, 3], [4, 5, 6]])
    nn.initialize_weights(X.shape[1], 2, params)
    f_post = nn.forward(X, params)

    #To test sigmoid
    # arr = np.array([1,3,5])
    # res = nn.sigmoid(arr)

    #To test softmax
    # X = np.array([[1,2,3],[4,5,8]])
    # res = nn.softmax(X)

    #To test compute_loss_and_acc
    # y = np.array([[1,0,0],[0,0,1]])
    # probs = np.array([[0.5,0.2,0.3],[0.4,0.3,0.3]])
    # loss,acc = nn.compute_loss_and_acc(y,probs)
    # print(loss)
    # print(acc)

    #To test get_random_batches
    # x = np.array([[1,2,3],[4,5,8],[51,21,3],[14,25,48],[11,12,23],[44,55,68],[71,82,39],[40,58,87]])
    # y = np.array([[1],[1],[0],[2],[4],[3],[0],[0]])
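
    # A minimal reference implementation (an assumption; the nn module itself is not
    # shown here) of the numerically stable softmax that nn.softmax is tested against above.
    def softmax_sketch(X):
        # subtract the row-wise max for numerical stability, then normalize
        shifted = X - np.max(X, axis=1, keepdims=True)
        exp = np.exp(shifted)
        return exp / np.sum(exp, axis=1, keepdims=True)

    # e.g. softmax_sketch(np.array([[1., 2., 3.]])) ~ [[0.090, 0.245, 0.665]]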
Пример #14
0
    def build(self,
              dropout,
              char_dim,
              char_lstm_dim,
              char_bidirect,
              word_dim,
              word_lstm_dim,
              word_bidirect,
              lr_method,
              pre_emb,
              crf,
              cap_dim,
              training=True,
              **kwargs):
        """
        Build the network.
        """
        # Training parameters
        n_words = len(self.id_to_word)
        n_chars = len(self.id_to_char)
        n_tags = len(self.id_to_tag)

        # Number of capitalization features
        if cap_dim:
            n_cap = 4

        # Network variables
        is_train = T.iscalar('is_train')
        word_ids = T.ivector(name='word_ids')
        char_for_ids = T.imatrix(name='char_for_ids')
        char_rev_ids = T.imatrix(name='char_rev_ids')
        char_pos_ids = T.ivector(name='char_pos_ids')
        tag_ids = T.ivector(name='tag_ids')
        if cap_dim:
            cap_ids = T.ivector(name='cap_ids')

        # Sentence length
        s_len = (word_ids if word_dim else char_pos_ids).shape[0]

        # Final input (all word features)
        input_dim = 0
        inputs = []

        #
        # Word inputs
        #
        if word_dim:
            input_dim += word_dim
            word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer')
            word_input = word_layer.link(word_ids)
            inputs.append(word_input)
            # Initialize with pretrained embeddings
            if pre_emb and training:
                new_weights = word_layer.embeddings.get_value()
                print 'Loading pretrained embeddings from %s...' % pre_emb
                pretrained = {}
                emb_invalid = 0
                for i, line in enumerate(codecs.open(pre_emb, 'r', 'utf-8')):
                    line = line.rstrip().split()
                    if len(line) == word_dim + 1:
                        pretrained[line[0]] = np.array(
                            [float(x) for x in line[1:]]).astype(np.float32)
                    else:
                        emb_invalid += 1
                if emb_invalid > 0:
                    print 'WARNING: %i invalid lines' % emb_invalid
                c_found = 0
                c_lower = 0
                c_zeros = 0
                # Lookup table initialization
                for i in xrange(n_words):
                    word = self.id_to_word[i]
                    if word in pretrained:
                        new_weights[i] = pretrained[word]
                        c_found += 1
                    elif word.lower() in pretrained:
                        new_weights[i] = pretrained[word.lower()]
                        c_lower += 1
                    elif re.sub('\d', '0', word.lower()) in pretrained:
                        new_weights[i] = pretrained[re.sub(
                            '\d', '0', word.lower())]
                        c_zeros += 1
                word_layer.embeddings.set_value(new_weights)
                print 'Loaded %i pretrained embeddings.' % len(pretrained)
                print(
                    '%i / %i (%.4f%%) words have been initialized with '
                    'pretrained embeddings.') % (
                        c_found + c_lower + c_zeros, n_words, 100. *
                        (c_found + c_lower + c_zeros) / n_words)
                print(
                    '%i found directly, %i after lowercasing, '
                    '%i after lowercasing + zero.') % (c_found, c_lower,
                                                       c_zeros)

        #
        # Chars inputs
        #
        if char_dim:
            input_dim += char_lstm_dim
            char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer')

            char_lstm_for = LSTM(char_dim,
                                 char_lstm_dim,
                                 with_batch=True,
                                 name='char_lstm_for')
            char_lstm_rev = LSTM(char_dim,
                                 char_lstm_dim,
                                 with_batch=True,
                                 name='char_lstm_rev')

            char_lstm_for.link(char_layer.link(char_for_ids))
            char_lstm_rev.link(char_layer.link(char_rev_ids))

            char_for_output = char_lstm_for.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), char_pos_ids]
            char_rev_output = char_lstm_rev.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), char_pos_ids]

            inputs.append(char_for_output)
            if char_bidirect:
                inputs.append(char_rev_output)
                input_dim += char_lstm_dim

        #
        # Capitalization feature
        #
        if cap_dim:
            input_dim += cap_dim
            cap_layer = EmbeddingLayer(n_cap, cap_dim, name='cap_layer')
            inputs.append(cap_layer.link(cap_ids))

        # Prepare final input
        if len(inputs) != 1:
            inputs = T.concatenate(inputs, axis=1)

        #
        # Dropout on final input
        #
        if dropout:
            dropout_layer = DropoutLayer(p=dropout)
            input_train = dropout_layer.link(inputs)
            input_test = (1 - dropout) * inputs
            inputs = T.switch(T.neq(is_train, 0), input_train, input_test)

        # LSTM for words
        word_lstm_for = LSTM(input_dim,
                             word_lstm_dim,
                             with_batch=False,
                             name='word_lstm_for')
        word_lstm_rev = LSTM(input_dim,
                             word_lstm_dim,
                             with_batch=False,
                             name='word_lstm_rev')
        word_lstm_for.link(inputs)
        word_lstm_rev.link(inputs[::-1, :])
        word_for_output = word_lstm_for.h
        word_rev_output = word_lstm_rev.h[::-1, :]
        if word_bidirect:
            final_output = T.concatenate([word_for_output, word_rev_output],
                                         axis=1)
            tanh_layer = HiddenLayer(2 * word_lstm_dim,
                                     word_lstm_dim,
                                     name='tanh_layer',
                                     activation='tanh')
            final_output = tanh_layer.link(final_output)
        else:
            final_output = word_for_output

        # Sentence to Named Entity tags - Score
        final_layer = HiddenLayer(word_lstm_dim,
                                  n_tags,
                                  name='final_layer',
                                  activation=(None if crf else 'softmax'))
        tags_scores = final_layer.link(final_output)

        # No CRF
        if not crf:
            cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean()
        # CRF
        else:
            transitions = shared((n_tags + 2, n_tags + 2), 'transitions')

            small = -1000
            b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32)
            e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32)

            # s_len: number of words in the sentence
            observations = T.concatenate(
                [tags_scores, small * T.ones((s_len, 2))], axis=1)
            # pad the (s_len x n_tags) tags_scores with two extra columns for the start/end tags
            observations = T.concatenate([b_s, observations, e_s], axis=0)
            # Score from tags
            real_path_score = tags_scores[T.arange(s_len), tag_ids].sum()

            # Score from transitions
            b_id = theano.shared(value=np.array([n_tags], dtype=np.int32))
            e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32))
            padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0)

            real_path_score += transitions[padded_tags_ids[T.arange(s_len +
                                                                    1)],
                                           padded_tags_ids[T.arange(s_len + 1)
                                                           + 1]].sum()

            all_paths_scores = forward(observations, transitions)
            cost = -(real_path_score - all_paths_scores)

        # Network parameters
        params = []
        if word_dim:
            self.add_component(word_layer)
            params.extend(word_layer.params)
        if char_dim:
            self.add_component(char_layer)
            self.add_component(char_lstm_for)
            params.extend(char_layer.params)
            params.extend(char_lstm_for.params)
            if char_bidirect:
                self.add_component(char_lstm_rev)
                params.extend(char_lstm_rev.params)
        self.add_component(word_lstm_for)
        params.extend(word_lstm_for.params)
        if word_bidirect:
            self.add_component(word_lstm_rev)
            params.extend(word_lstm_rev.params)
        if cap_dim:
            self.add_component(cap_layer)
            params.extend(cap_layer.params)
        self.add_component(final_layer)
        params.extend(final_layer.params)
        if crf:
            self.add_component(transitions)
            params.append(transitions)
        if word_bidirect:
            self.add_component(tanh_layer)
            params.extend(tanh_layer.params)

        # Prepare train and eval inputs
        eval_inputs = []
        if word_dim:
            eval_inputs.append(word_ids)
        if char_dim:
            eval_inputs.append(char_for_ids)
            if char_bidirect:
                eval_inputs.append(char_rev_ids)
            eval_inputs.append(char_pos_ids)
        if cap_dim:
            eval_inputs.append(cap_ids)
        train_inputs = eval_inputs + [tag_ids]

        # Parse optimization method parameters
        if "-" in lr_method:
            lr_method_name = lr_method[:lr_method.find('-')]
            lr_method_parameters = {}
            for x in lr_method[lr_method.find('-') + 1:].split('-'):
                split = x.split('_')
                assert len(split) == 2
                lr_method_parameters[split[0]] = float(split[1])
        else:
            lr_method_name = lr_method
            lr_method_parameters = {}

        # Compile training function
        print 'Compiling...'
        if training:
            updates = Optimization(clip=5.0).get_updates(
                lr_method_name, cost, params, **lr_method_parameters)
            f_train = theano.function(inputs=train_inputs,
                                      outputs=cost,
                                      updates=updates,
                                      givens=({
                                          is_train: np.cast['int32'](1)
                                      } if dropout else {}))
        else:
            f_train = None

        # Compile evaluation function
        if not crf:
            f_eval = theano.function(inputs=eval_inputs,
                                     outputs=tags_scores,
                                     givens=({
                                         is_train: np.cast['int32'](0)
                                     } if dropout else {}))
        else:
            f_eval = theano.function(inputs=eval_inputs,
                                     outputs=forward(
                                         observations,
                                         transitions,
                                         viterbi=True,
                                         return_alpha=False,
                                         return_best_sequence=True),
                                     givens=({
                                         is_train: np.cast['int32'](0)
                                     } if dropout else {}))

        return f_train, f_eval
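
# A minimal numpy sketch (hypothetical) of the Viterbi decoding performed when
# forward(..., viterbi=True, return_best_sequence=True) is used as the evaluation
# output above; it returns the highest-scoring tag sequence over the padded
# observation matrix.
import numpy as np

def viterbi_decode(observations, transitions):
    n_steps, _ = observations.shape
    delta = observations[0]          # best score of any path ending in each label
    backpointers = []
    for t in range(1, n_steps):
        scores = delta[:, None] + transitions + observations[t][None, :]
        backpointers.append(np.argmax(scores, axis=0))
        delta = np.max(scores, axis=0)
    best_last = int(np.argmax(delta))
    path = [best_last]
    for bp in reversed(backpointers):
        path.append(int(bp[path[-1]]))
    return list(reversed(path))      # includes the artificial begin/end positions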
Пример #15
0
    def build(
            self,
            dropout,
            ortho_char_input_dim,  # Should be inferred from the input
            ortho_char_dim,
            ortho_char_lstm_dim,
            char_bidirect,
            word_vec_input_dim,  # Should be inferred from the input wvecs
            word_dim,  # The vector size after projection of the input vector
            word_lstm_dim,
            word_bidirect,
            lr_method,
            crf,
            use_type_sparse_feats,
            type_sparse_feats_input_dim,  # Can be inferred from the output of the feature extractors
            type_sparse_feats_proj_dim,  # This is a hyper-parameter
            use_token_sparse_feats,
            token_sparse_feats_input_dim,  # Can be inferred from the output of the feature extractors
            # token_sparse_feats_proj_dim,  # This is a hyper-parameter
            use_ortho_attention,
            use_phono_attention,
            # use_convolution,
            phono_char_input_dim,  # Can be inferred
            phono_char_dim,
            phono_char_lstm_dim,
            training=True,
            **kwargs):
        """
        Build the network.
        """
        assert word_dim or phono_char_dim or ortho_char_dim, "No input selected while building the network!"
        # Training parameters
        n_tags = len(self.id_to_tag)

        # Network variables
        is_train = T.iscalar('is_train')
        word_vecs = T.dmatrix(
            name="word_vecs")  # A vector for each word in the sentence
        #  => matrix: (len_sent, w_emb_dim)
        ortho_char_for_vecs = T.dtensor3(
            name="ortho_char_for_vecs"
        )  # For each char of each word in the sentence, a char vector
        # ortho_char_for_vecs = T.ftensor3(name="ortho_char_for_vecs")
        # => tensor of form: (len_sent, max_wchar_len, char_emb_dim)
        ortho_char_rev_vecs = T.dtensor3(name="ortho_char_rev_vecs")
        # ortho_char_rev_vecs = T.ftensor3(name="ortho_char_rev_vecs")
        # For each char of each word in the sentence, a char vector
        # => tensor of form: (len_sent, max_wchar_len, char_emb_dim)
        phono_char_for_vecs = T.dtensor3(name="phono_char_for_vecs")
        # phono_char_for_vecs = T.ftensor3(name="phono_char_for_vecs")
        # For each char of each word in the sentence, a char vector
        # => tensor of form: (len_sent, max_ortho_char_len, char_emb_dim)
        phono_char_rev_vecs = T.dtensor3(name="phono_char_rev_vecs")
        # phono_char_rev_vecs = T.ftensor3(name="phono_char_rev_vecs")
        # For each char of each word in the sentence, a char vector
        # => tensor of form: (len_sent, max_phono_char_len, char_emb_dim)
        ortho_char_pos_ids = T.ivector(name='ortho_char_pos_ids')
        # The word len for each word in the sentence => vect of form: (len_sent,)
        phono_char_pos_ids = T.ivector(name='phono_char_pos_ids')
        # The word len for each word in the sentence => vect of form: (len_sent,)
        type_sparse_feats = T.imatrix(name="type_sparse_feats")
        # Type sparse features are appended to the input to the word lstm
        # For each word, a vector of type level sparse feats => mat of form: (len_sent, type_sparse_dim)
        token_sparse_feats = T.imatrix(name="token_sparse_feats")
        # Token sparse features are appended to the pre-crf layer
        # For each word, a vector of token level sparse feats => mat of form: (len_sent, token_sparse_dim)

        tag_ids = T.ivector(name='tag_ids')
        # The tag id for each word in the sentence => vect of form: (len_sent,)

        # Sentence length
        s_len = (word_vecs if word_dim else ortho_char_pos_ids
                 if ortho_char_dim else phono_char_pos_ids).shape[0]

        # Final input (all word features)
        input_dim = 0
        inputs = []

        #
        # Word inputs
        #
        if word_dim:
            input_dim += word_dim
            word_layer = HiddenLayer(word_vec_input_dim,
                                     word_dim,
                                     activation="tanh",
                                     name="word_emb_proj")
            # TO DO : Try not using the bias term in the hidden layer
            word_input = word_layer.link(word_vecs)
            inputs.append(word_input)

        #
        # Chars inputs
        #
        if ortho_char_dim:
            input_dim += ortho_char_lstm_dim
            ortho_char_layer = HiddenLayer(ortho_char_input_dim,
                                           ortho_char_dim,
                                           activation="tanh",
                                           name="ortho_char_emb_proj")
            # TO DO : Try not using bias in the hidden layer
            ortho_char_lstm_for = LSTM(ortho_char_dim,
                                       ortho_char_lstm_dim,
                                       with_batch=True,
                                       name='ortho_char_lstm_for')
            ortho_char_lstm_rev = LSTM(ortho_char_dim,
                                       ortho_char_lstm_dim,
                                       with_batch=True,
                                       name='ortho_char_lstm_rev')
            ortho_char_lstm_for.link(
                ortho_char_layer.link(ortho_char_for_vecs))
            ortho_char_lstm_rev.link(
                ortho_char_layer.link(ortho_char_rev_vecs))

            ortho_char_for_output = ortho_char_lstm_for.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), ortho_char_pos_ids]
            ortho_char_rev_output = ortho_char_lstm_rev.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), ortho_char_pos_ids]

            inputs.append(ortho_char_for_output)
            if char_bidirect:
                inputs.append(ortho_char_rev_output)
                input_dim += ortho_char_lstm_dim

        if phono_char_dim:
            input_dim += phono_char_lstm_dim
            phono_char_layer = HiddenLayer(phono_char_input_dim,
                                           phono_char_dim,
                                           activation="tanh",
                                           name="phono_char_emb_proj")
            # TO DO : Try not using bias in the hidden layer
            phono_char_lstm_for = LSTM(phono_char_dim,
                                       phono_char_lstm_dim,
                                       with_batch=True,
                                       name='phono_char_lstm_for')
            phono_char_lstm_rev = LSTM(phono_char_dim,
                                       phono_char_lstm_dim,
                                       with_batch=True,
                                       name='phono_char_lstm_rev')

            phono_char_lstm_for.link(
                phono_char_layer.link(phono_char_for_vecs))
            phono_char_lstm_rev.link(
                phono_char_layer.link(phono_char_rev_vecs))

            phono_char_for_output = phono_char_lstm_for.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), phono_char_pos_ids]
            phono_char_rev_output = phono_char_lstm_rev.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), phono_char_pos_ids]

            inputs.append(phono_char_for_output)
            if char_bidirect:
                inputs.append(phono_char_rev_output)
                input_dim += phono_char_lstm_dim

        # Type level sparse feats
        #
        if use_type_sparse_feats:
            input_dim += type_sparse_feats_input_dim
            type_level_sparse_layer = HiddenLayer(
                type_sparse_feats_input_dim,
                type_sparse_feats_proj_dim,
                activation="tanh",
                name='type_level_sparse_layer')
            # TO DO : Try not using the hidden layer here
            inputs.append(type_level_sparse_layer.link(type_sparse_feats))

        # Prepare final input
        if len(inputs) != 1:
            inputs = T.concatenate(inputs, axis=1)
            # TO DO : If using type sparse features, then apply hidden layer after concatenating all inputs
        else:
            inputs = inputs[0]
        #
        # Dropout on final input
        #
        if dropout:
            dropout_layer = DropoutLayer(p=dropout)
            input_train = dropout_layer.link(inputs)
            input_test = (1 - dropout) * inputs
            """
            Drop out involves sampling a vector of bernoulli random variables with a parameter 1-p and using it as a mask
            So, the expected value of the dropped out input is p * (0*x) + (1-p) * (1*x) = (1-p) * x. Since biases will
            on average respond to the expected input value, at test time we multiply test inputs (1-p) to supply the
            expected test input instead.
            """
            inputs = T.switch(T.neq(is_train, 0), input_train, input_test)
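            # e.g. with p = 0.5 a unit whose activation is x contributes
            # 0.5 * 0 + 0.5 * x = 0.5 * x on average during training, which is what
            # the (1 - dropout) * inputs rescaling reproduces at test time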

        # LSTM for words
        word_lstm_for = LSTM(input_dim,
                             word_lstm_dim,
                             with_batch=False,
                             name='word_lstm_for')
        word_lstm_rev = LSTM(input_dim,
                             word_lstm_dim,
                             with_batch=False,
                             name='word_lstm_rev')
        word_lstm_for.link(inputs)
        word_lstm_rev.link(inputs[::-1, :])
        word_for_output = word_lstm_for.h
        word_rev_output = word_lstm_rev.h[::-1, :]
        lstm_outputs = [word_for_output]
        post_word_lstm_output_size = word_lstm_dim
        if use_token_sparse_feats:
            # token_level_sparse_layer = HiddenLayer(token_sparse_feats_input_dim, token_sparse_feats_proj_dim,
            #                                       activation="tanh",
            #                                       name='token_level_sparse_layer')
            # # TO DO : Try not using the hidden layer here
            # lstm_outputs.append(token_level_sparse_layer.link(token_sparse_feats))
            # post_word_lstm_output_size += token_sparse_feats_proj_dim
            lstm_outputs.append(token_sparse_feats)
            post_word_lstm_output_size += token_sparse_feats_input_dim
        if word_bidirect:
            lstm_outputs.append(word_rev_output)
            post_word_lstm_output_size += word_lstm_dim

        if len(lstm_outputs) > 1:
            final_output = T.concatenate(lstm_outputs, axis=1)
            tanh_layer = HiddenLayer(post_word_lstm_output_size,
                                     word_lstm_dim,
                                     name='tanh_layer',
                                     activation='tanh')
            final_output = tanh_layer.link(final_output)

        else:
            final_output = word_for_output

        final_pre_crf_input_size = word_lstm_dim
        attention_vectors = []
        attention_vector_size = 0
        if use_ortho_attention and ortho_char_dim:
            # final_ortho_attention_input_layer = HiddenLayer(post_word_lstm_output_size, ortho_char_lstm_dim,
            #                                   name='final_ortho_attention_input_layer', activation='tanh')
            final_ortho_attention_input_layer = HiddenLayer(
                word_lstm_dim,
                ortho_char_lstm_dim,
                name='final_ortho_attention_input_layer',
                activation='tanh')
            final_ortho_attention_input = final_ortho_attention_input_layer.link(
                final_output)
            # Evaluating attentional vector using a linear projection from final_output since the attention vector
            # must be conditioned on it and dimension must match the char lstm hidden dim.
            ortho_for_attention = self.get_TDAttention_vector(
                final_ortho_attention_input,
                ortho_char_lstm_for.h.dimshuffle((1, 0, 2)),
                ortho_char_pos_ids)
            if char_bidirect:
                ortho_rev_attention = self.get_TDAttention_vector(
                    final_ortho_attention_input,
                    ortho_char_lstm_rev.h.dimshuffle((1, 0, 2)),
                    ortho_char_pos_ids)
                attention_vectors.append(ortho_rev_attention)
                attention_vector_size += ortho_char_lstm_dim
            attention_vectors.append(ortho_for_attention)
            attention_vector_size += ortho_char_lstm_dim
        if use_phono_attention and phono_char_dim:
            # final_phono_attention_input_layer = HiddenLayer(post_word_lstm_output_size, phono_char_lstm_dim,
            #                                               name='final_phono_attention_input_layer', activation='tanh')
            final_phono_attention_input_layer = HiddenLayer(
                word_lstm_dim,
                phono_char_lstm_dim,
                name='final_phono_attention_input_layer',
                activation='tanh')
            # Evaluating attentional vector using a linear projection from final_output since the attention vector
            # must be conditioned on it and dimension must match the char lstm hidden dim.
            final_phono_attention_input = final_phono_attention_input_layer.link(
                final_output)
            phono_for_attention = self.get_TDAttention_vector(
                final_phono_attention_input,
                phono_char_lstm_for.h.dimshuffle((1, 0, 2)),
                phono_char_pos_ids)
            if char_bidirect:
                phono_rev_attention = self.get_TDAttention_vector(
                    final_phono_attention_input,
                    phono_char_lstm_rev.h.dimshuffle((1, 0, 2)),
                    phono_char_pos_ids)
                attention_vectors.append(phono_rev_attention)
                attention_vector_size += phono_char_lstm_dim
            attention_vectors.append(phono_for_attention)
            attention_vector_size += phono_char_lstm_dim
        if len(attention_vectors) > 1:
            attention_vectors = T.concatenate(attention_vectors, axis=1)

        if use_phono_attention or use_ortho_attention:
            final_output = T.concatenate([final_output, attention_vectors],
                                         axis=1)
            post_word_lstm_output_size += attention_vector_size
            final_pre_crf_input_size += attention_vector_size

        # Sentence to Named Entity tags - Score
        final_layer = HiddenLayer(final_pre_crf_input_size,
                                  n_tags,
                                  name='final_layer',
                                  activation=(None if crf else 'softmax'))
        tags_scores = final_layer.link(final_output)

        # No CRF
        if not crf:
            cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean()
        # CRF
        else:
            transitions = shared((n_tags + 2, n_tags + 2), 'transitions')
            # n_tags + 2 to accommodate start and end symbols

            small = -1000  # stands in for log(0), i.e. an impossible event
            b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32)
            # The score of starting at the start symbol is log(1) = 0; the score of the
            # start symbol taking any real NER tag is log(0), approximated by small
            e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32)
            # The score of ending at the end symbol is log(1) = 0; the score of the
            # end symbol taking any real NER tag is log(0), approximated by small
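            # e.g. with n_tags = 3 (and small = -1000):
            #   b_s = [[-1000, -1000, -1000,     0, -1000]]
            #   e_s = [[-1000, -1000, -1000, -1000,     0]]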
            observations = T.concatenate(
                [tags_scores, small * T.ones((s_len, 2))], axis=1)
            # observations holds the emission score (log potential) of each token for each tag.
            # The emission score of ordinary words towards the start and end tags is log(0), i.e. small.

            observations = T.concatenate([b_s, observations, e_s], axis=0)
            # observations now contains the emission energies for start token, sentence tokens and end token

            # Score from tags
            real_path_score = tags_scores[T.arange(s_len), tag_ids].sum()
            # Sum of the emission scores associated with the gold tags

            # Score from transitions
            b_id = theano.shared(value=np.array([n_tags], dtype=np.int32))
            e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32))
            padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0)
            real_path_score += transitions[padded_tags_ids[T.arange(s_len +
                                                                    1)],
                                           padded_tags_ids[T.arange(s_len + 1)
                                                           + 1]].sum()
            # Transition scores from label_i to label_{i+1}

            all_paths_scores = forward(observations, transitions)
            cost = -(real_path_score - all_paths_scores)

        # Network parameters
        params = []
        if word_dim:
            self.add_component(word_layer)
            params.extend(word_layer.params)
        if ortho_char_dim:
            self.add_component(ortho_char_layer)
            self.add_component(ortho_char_lstm_for)
            params.extend(ortho_char_layer.params)
            params.extend(ortho_char_lstm_for.params)
            if char_bidirect:
                self.add_component(ortho_char_lstm_rev)
                params.extend(ortho_char_lstm_rev.params)

        if phono_char_dim:
            self.add_component(phono_char_layer)
            self.add_component(phono_char_lstm_for)
            params.extend(phono_char_layer.params)
            params.extend(phono_char_lstm_for.params)
            if char_bidirect:
                self.add_component(phono_char_lstm_rev)
                params.extend(phono_char_lstm_rev.params)

        if use_type_sparse_feats:
            self.add_component(type_level_sparse_layer)
            params.extend(type_level_sparse_layer.params)

        self.add_component(word_lstm_for)
        params.extend(word_lstm_for.params)

        if word_bidirect:
            self.add_component(word_lstm_rev)
            params.extend(word_lstm_rev.params)

        if word_bidirect or len(lstm_outputs) > 1:
            self.add_component(tanh_layer)
            params.extend(tanh_layer.params)

        if use_ortho_attention and ortho_char_dim:
            self.add_component(final_ortho_attention_input_layer)
            params.extend(final_ortho_attention_input_layer.params)
        if use_phono_attention and phono_char_dim:
            self.add_component(final_phono_attention_input_layer)
            params.extend(final_phono_attention_input_layer.params)

        self.add_component(final_layer)
        params.extend(final_layer.params)
        if crf:
            self.add_component(transitions)
            params.append(transitions)

        # Prepare train and eval inputs
        eval_inputs = []
        if word_dim:
            # eval_inputs.append(word_ids)
            eval_inputs.append(word_vecs)
        if ortho_char_dim:
            # eval_inputs.append(char_for_ids)
            eval_inputs.append(ortho_char_for_vecs)
            if char_bidirect:
                # eval_inputs.append(char_rev_ids)
                eval_inputs.append(ortho_char_rev_vecs)
            eval_inputs.append(ortho_char_pos_ids)
        if phono_char_dim:
            # eval_inputs.append(char_for_ids)
            eval_inputs.append(phono_char_for_vecs)
            if char_bidirect:
                # eval_inputs.append(char_rev_ids)
                eval_inputs.append(phono_char_rev_vecs)
            eval_inputs.append(phono_char_pos_ids)

        if use_type_sparse_feats:
            eval_inputs.append(type_sparse_feats)
        if use_token_sparse_feats:
            eval_inputs.append(token_sparse_feats)
        train_inputs = eval_inputs + [tag_ids]

        # Parse optimization method parameters
        if "-" in lr_method:
            lr_method_name = lr_method[:lr_method.find('-')]
            lr_method_parameters = {}
            for x in lr_method[lr_method.find('-') + 1:].split('-'):
                split = x.split('_')
                assert len(split) == 2
                lr_method_parameters[split[0]] = float(split[1])
        else:
            lr_method_name = lr_method
            lr_method_parameters = {}
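        # e.g. lr_method = "sgd-lr_0.005" yields lr_method_name = "sgd" and
        # lr_method_parameters = {"lr": 0.005}; a bare "sgd" yields an empty parameter dict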

        # Compile training function
        print 'Compiling...'
        if training:
            updates = Optimization(clip=5.0).get_updates(
                lr_method_name, cost, params, **lr_method_parameters)
            f_train = theano.function(inputs=train_inputs,
                                      outputs=cost,
                                      updates=updates,
                                      givens=({
                                          is_train: np.cast['int32'](1)
                                      } if dropout else {}))
        else:
            f_train = None

        # Compile evaluation function
        if not crf:
            f_eval = theano.function(inputs=eval_inputs,
                                     outputs=tags_scores,
                                     givens=({
                                         is_train: np.cast['int32'](0)
                                     } if dropout else {}))
        else:
            f_eval = theano.function(inputs=eval_inputs,
                                     outputs=forward(
                                         observations,
                                         transitions,
                                         viterbi=True,
                                         return_alpha=False,
                                         return_best_sequence=True),
                                     givens=({
                                         is_train: np.cast['int32'](0)
                                     } if dropout else {}))
        print("Finished Compiling")
        return f_train, f_eval
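The CRF branches above all delegate the partition function to a `forward` helper that
is not shown in this listing. As a rough illustration only (plain NumPy, not the repo's
Theano implementation; the name `crf_log_partition` is made up for this sketch),
`all_paths_scores = forward(observations, transitions)` is assumed to be the
log-sum-exp of emission plus transition scores over every possible tag path:

import numpy as np

def crf_log_partition(observations, transitions):
    # observations: (T, K) emission log-scores per step (including any padded
    #               start/end rows, as constructed in the snippet above)
    # transitions:  (K, K) transition log-scores, transitions[i, j] = score(i -> j)
    alpha = observations[0]                      # scores of all length-1 prefixes
    for obs in observations[1:]:
        # combine prefix scores with transition and emission scores,
        # then log-sum-exp over the previous tag
        scores = alpha[:, None] + transitions + obs[None, :]
        m = scores.max(axis=0)
        alpha = m + np.log(np.exp(scores - m).sum(axis=0))
    m = alpha.max()
    return m + np.log(np.exp(alpha - m).sum())

With this reading, the training criterion above is simply
cost = crf_log_partition(observations, transitions) - real_path_score.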
Пример #16
0
    def build(self, parameters):
        #{{{
        """
        Build the network.
        """
        #some parameters
        dropout = parameters['dropout']
        char_dim = parameters['char_dim']
        char_lstm_dim = parameters['char_lstm_dim']
        char_bidirect = parameters['char_bidirect']
        word_dim = parameters['word_dim']
        word_lstm_dim = parameters['word_lstm_dim']
        word_bidirect = parameters['word_bidirect']
        lr_method = parameters['lr_method']
        pre_emb = parameters['pre_emb']
        crf = parameters['crf']
        cap_dim = parameters['cap_dim']
        training = parameters['training']
        features = parameters['features']

        # Training parameters
        n_words = len(self.id_to_word)
        n_chars = len(self.id_to_char)
        n_tags = len(self.id_to_tag)
        self.output_dim = len(self.id_to_tag)
        self.transitions = shared((self.output_dim + 1, self.output_dim),
                                  'transitions')

        # Number of capitalization features
        if cap_dim:
            n_cap = 4

        if features is not None and features['lemma']['isUsed']:
            lemma_ids = T.ivector(name='lemma_ids')
        if features is not None and features['pos']['isUsed']:
            pos_ids = T.ivector(name='pos_ids')
        if features is not None and features['chunk']['isUsed']:
            chunk_ids = T.ivector(name='chunk_ids')
        if features is not None and features['NER']['isUsed']:
            dic_ids = T.ivector(name='dic_ids')

        # Network variables
        is_train = T.iscalar('is_train')
        word_ids = T.ivector(name='word_ids')
        char_for_ids = T.imatrix(name='char_for_ids')
        char_rev_ids = T.imatrix(name='char_rev_ids')
        char_pos_ids = T.ivector(name='char_pos_ids')
        tag_ids = T.ivector(name='tag_ids')
        if cap_dim:
            cap_ids = T.ivector(name='cap_ids')

        # Sentence length
        s_len = (word_ids if word_dim else char_pos_ids).shape[0]

        # Final input (all word features)
        input_dim = 0
        inputs = []

        # Word inputs
        #{{{
        if word_dim:
            input_dim += word_dim
            word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer')
            word_input = word_layer.link(word_ids)
            #for attention
            inputs.append(word_input)
            # Initialize with pretrained embeddings
            if pre_emb and training:
                new_weights = word_layer.embeddings.get_value()
                print 'Loading pretrained embeddings from %s...' % pre_emb
                pretrained = {}
                emb_invalid = 0
                for i, line in enumerate(codecs.open(pre_emb, 'r', 'utf-8')):
                    line = line.rstrip().split()
                    if len(line) == word_dim + 1:
                        pretrained[line[0]] = np.array(
                            [float(x) for x in line[1:]]).astype(np.float32)
                    else:
                        emb_invalid += 1
                if emb_invalid > 0:
                    print 'WARNING: %i invalid lines' % emb_invalid
                c_found = 0
                c_lower = 0
                c_zeros = 0
                # Lookup table initialization
                for i in xrange(n_words):
                    word = self.id_to_word[i]
                    if word in pretrained:
                        new_weights[i] = pretrained[word]
                        c_found += 1
                    elif word.lower() in pretrained:
                        new_weights[i] = pretrained[word.lower()]
                        c_lower += 1
                    elif re.sub('\d', '0', word.lower()) in pretrained:
                        new_weights[i] = pretrained[re.sub(
                            '\d', '0', word.lower())]
                        c_zeros += 1
                word_layer.embeddings.set_value(new_weights)
                print 'Loaded %i pretrained embeddings.' % len(pretrained)
                print(
                    '%i / %i (%.4f%%) words have been initialized with '
                    'pretrained embeddings.') % (
                        c_found + c_lower + c_zeros, n_words, 100. *
                        (c_found + c_lower + c_zeros) / n_words)
                print(
                    '%i found directly, %i after lowercasing, '
                    '%i after lowercasing + zero.') % (
                        c_found, c_lower, c_zeros)  #}}}

        # Chars inputs
#{{{
        if char_dim:
            input_dim += char_lstm_dim
            char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer')

            char_lstm_for = LSTM(char_dim,
                                 char_lstm_dim,
                                 with_batch=True,
                                 name='char_lstm_for')
            char_lstm_rev = LSTM(char_dim,
                                 char_lstm_dim,
                                 with_batch=True,
                                 name='char_lstm_rev')

            char_lstm_for.link(char_layer.link(char_for_ids))
            char_lstm_rev.link(char_layer.link(char_rev_ids))

            char_for_output = char_lstm_for.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), char_pos_ids]
            char_rev_output = char_lstm_rev.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), char_pos_ids]
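            # h has shape (max_word_len, n_words, char_lstm_dim); dimshuffle makes it
            # (n_words, max_word_len, dim) so that, for each word, we select the hidden
            # state at char_pos_ids (typically the index of the word's last character)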

            inputs.append(char_for_output)
            if char_bidirect:
                inputs.append(char_rev_output)
                input_dim += char_lstm_dim
#}}}

# Capitalization feature
#
        if cap_dim:
            input_dim += cap_dim
            cap_layer = EmbeddingLayer(n_cap, cap_dim, name='cap_layer')
            inputs.append(cap_layer.link(cap_ids))

        # Prepare final input
        if len(inputs) != 1:
            inputs = T.concatenate(inputs, axis=1)

        #
        # Dropout on final input
        #
        if dropout:
            dropout_layer = DropoutLayer(p=dropout)
            input_train = dropout_layer.link(inputs)
            input_test = (1 - dropout) * inputs
            inputs = T.switch(T.neq(is_train, 0), input_train, input_test)
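            # standard (non-inverted) dropout: units are dropped during training and
            # the full input is rescaled by the keep probability (1 - dropout) at test time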

        # LSTM for words
        word_lstm_for = LSTM(input_dim,
                             word_lstm_dim,
                             with_batch=False,
                             name='word_lstm_for')
        word_lstm_rev = LSTM(input_dim,
                             word_lstm_dim,
                             with_batch=False,
                             name='word_lstm_rev')
        word_lstm_for.link(inputs)
        word_lstm_rev.link(inputs[::-1, :])
        word_for_output = word_lstm_for.h
        word_rev_output = word_lstm_rev.h[::-1, :]
        if word_bidirect:
            final_output = T.concatenate([word_for_output, word_rev_output],
                                         axis=1)
            tanh_layer = HiddenLayer(2 * word_lstm_dim,
                                     word_lstm_dim,
                                     name='tanh_layer',
                                     activation='tanh')
            final_output = tanh_layer.link(final_output)
        else:
            final_output = word_for_output

        # Sentence to Named Entity tags - Score
        final_layer = HiddenLayer(word_lstm_dim,
                                  n_tags,
                                  name='final_layer',
                                  activation=(None if crf else 'softmax'))
        tags_scores = final_layer.link(final_output)

        # No CRF
        if not crf:
            cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean()
        # CRF
        else:

            #all_paths_scores = forward(observations, self.transitions)
            #cost = - (self.modelScore(tag_ids,tags_scores,s_len) - all_paths_scores)
            #real_path_score=self.modelScore(tag_ids,tags_scores,tag_ids.shape[0]) ;
            #error=real_path_score+self.noiseLoss(tags_scores,tag_ids,0.5);
            #cost=-error;
            #cost=self.likehoodLoss(tags_scores,tag_ids,observations,2)

            real_path_score = tags_scores[T.arange(s_len), tag_ids].sum()

            # Score from transitions
            padded_tags_ids = T.concatenate([[n_tags], tag_ids], axis=0)
            real_path_score += self.transitions[
                padded_tags_ids[T.arange(s_len)],
                padded_tags_ids[T.arange(s_len) + 1]].sum()

            all_paths_scores = forward(tags_scores, self.transitions)
            cost = -(real_path_score - all_paths_scores)
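            # Unlike the previous example, this variant passes tags_scores directly
            # (no padded start/end observations) and uses an (n_tags + 1, n_tags)
            # transition matrix whose extra row acts as a virtual start tag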

        # Network parameters
        params = []
        if word_dim:
            self.add_component(word_layer)
            params.extend(word_layer.params)
        if char_dim:
            self.add_component(char_layer)
            self.add_component(char_lstm_for)
            params.extend(char_layer.params)
            params.extend(char_lstm_for.params)
            if char_bidirect:
                self.add_component(char_lstm_rev)
                params.extend(char_lstm_rev.params)
        self.add_component(word_lstm_for)
        params.extend(word_lstm_for.params)
        if word_bidirect:
            self.add_component(word_lstm_rev)
            params.extend(word_lstm_rev.params)
        if cap_dim:
            self.add_component(cap_layer)
            params.extend(cap_layer.params)
        self.add_component(final_layer)
        params.extend(final_layer.params)
        if crf:
            self.add_component(self.transitions)
            params.append(self.transitions)
        if word_bidirect:
            self.add_component(tanh_layer)
            params.extend(tanh_layer.params)

        # Prepare train and eval inputs
        eval_inputs = []
        if word_dim:
            eval_inputs.append(word_ids)
        if char_dim:
            eval_inputs.append(char_for_ids)
            if char_bidirect:
                eval_inputs.append(char_rev_ids)
            eval_inputs.append(char_pos_ids)
        if cap_dim:
            eval_inputs.append(cap_ids)
        train_inputs = eval_inputs + [tag_ids]

        # Parse optimization method parameters
        if "-" in lr_method:
            lr_method_name = lr_method[:lr_method.find('-')]
            lr_method_parameters = {}
            for x in lr_method[lr_method.find('-') + 1:].split('-'):
                split = x.split('_')
                assert len(split) == 2
                lr_method_parameters[split[0]] = float(split[1])
        else:
            lr_method_name = lr_method
            lr_method_parameters = {}

        # Compile training function
        print 'Compiling...'
        if training:
            import optimizers
            self.optimizer = optimizers.RMSprop(lr=0.001)
            updates = Optimization(clip=5.0).get_updates(
                lr_method_name, cost, params, **lr_method_parameters)
            self.constraints = {}
            #updates = self.optimizer.get_updates(params,self.constraints,cost);
            f_train = theano.function(inputs=train_inputs,
                                      outputs=cost,
                                      updates=updates,
                                      givens=({
                                          is_train: np.cast['int32'](1)
                                      } if dropout else {}))
            #for debug
            #f_Debug = theano.function(
            #    inputs=train_inputs,
            #    outputs=cost,
            #    updates=self.update,
            #    givens=({is_train: np.cast['int32'](1)} if dropout else {})
            #)
            #debug end
        else:
            f_train = None

        # Compile evaluation function
        if not crf:
            f_eval = theano.function(inputs=eval_inputs,
                                     outputs=tags_scores,
                                     givens=({
                                         is_train: np.cast['int32'](0)
                                     } if dropout else {}))
        else:
            f_eval = theano.function(inputs=eval_inputs,
                                     outputs=forward(
                                         tags_scores,
                                         self.transitions,
                                         viterbi=True,
                                         return_alpha=False,
                                         return_best_sequence=True),
                                     givens=({
                                         is_train: np.cast['int32'](0)
                                     } if dropout else {}))

        return f_train, f_eval
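For reference, the "real path" score used in this variant can be spelled out in a few
lines of plain NumPy (illustrative only; `gold_path_score` is a made-up name, not part
of the repo): the emission scores of the gold tags plus the transition scores, with
index n_tags standing in for a virtual start tag (the extra row of self.transitions).

import numpy as np

def gold_path_score(tags_scores, tag_ids, transitions, n_tags):
    # tags_scores: (s_len, n_tags) emission scores
    # transitions: (n_tags + 1, n_tags); row n_tags scores starting with each tag
    s_len = len(tag_ids)
    emission = tags_scores[np.arange(s_len), tag_ids].sum()
    padded = np.concatenate([[n_tags], tag_ids])
    transition = transitions[padded[:-1], padded[1:]].sum()
    return emission + transition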
Пример #17
0
    def build4(self, parameters):
        #{{{
        """
        Build the network.
        """
        #some parameters
        dropout = parameters['dropout']
        char_dim = parameters['char_dim']
        char_lstm_dim = parameters['char_lstm_dim']
        char_bidirect = parameters['char_bidirect']
        word_dim = parameters['word_dim']
        word_lstm_dim = parameters['word_lstm_dim']
        word_bidirect = parameters['word_bidirect']
        lr_method = parameters['lr_method']
        pre_emb = parameters['pre_emb']
        crf = parameters['crf']
        cap_dim = parameters['cap_dim']
        training = parameters['training']
        features = parameters['features']
        useAttend = parameters['useAttend']
        if useAttend:
            reloadParam = parameters['loading']
        else:
            reloadParam = None
        if reloadParam is not None:
            reloadPath = parameters['loading_path']
        sentencesLevelLoss = parameters['sentencesLevelLoss']

        # Training parameters
        n_words = len(self.id_to_word)
        n_chars = len(self.id_to_char)
        n_tags = len(self.id_to_tag)
        self.output_dim = len(self.id_to_tag)
        self.transitions = shared((self.output_dim + 1, self.output_dim),
                                  'transitions')

        # Number of capitalization features
        if cap_dim:
            n_cap = 4

        # Network variables
        is_train = T.iscalar('is_train')
        word_ids = T.ivector(name='word_ids')
        wordTrue_ids = T.ivector(name='wordTrue_ids')
        char_for_ids = T.imatrix(name='char_for_ids')
        char_rev_ids = T.imatrix(name='char_rev_ids')
        char_pos_ids = T.ivector(name='char_pos_ids')
        docLen = T.ivector(name='docLen')
        tag_ids = T.ivector(name='tag_ids')
        if cap_dim:
            cap_ids = T.ivector(name='cap_ids')

        #some features
        if features is not None and features['lemma']['isUsed']:
            lemma_ids = T.ivector(name='lemma_ids')
        if features is not None and features['pos']['isUsed']:
            pos_ids = T.ivector(name='pos_ids')
        if features is not None and features['chunk']['isUsed']:
            chunk_ids = T.ivector(name='chunk_ids')
        if features is not None and features['dic']['isUsed']:
            dic_ids = T.ivector(name='dic_ids')

        # Sentence length
        s_len = (word_ids if word_dim else char_pos_ids).shape[0]

        # Final input (all word features)
        input_dim = 0
        inputs = []

        # Word inputs
        #{{{
        if word_dim:
            input_dim += word_dim
            word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer')
            word_input = word_layer.link(word_ids)
            wordTrue_input = word_layer.link(wordTrue_ids)
            inputs.append(word_input)
            # Initialize with pretrained embeddings
            if pre_emb and training:
                new_weights = word_layer.embeddings.get_value()
                print 'Loading pretrained embeddings from %s...' % pre_emb
                pretrained = {}
                emb_invalid = 0
                for i, line in enumerate(codecs.open(pre_emb, 'r', 'utf-8')):
                    line = line.rstrip().split()
                    if len(line) == word_dim + 1:
                        pretrained[line[0]] = np.array(
                            [float(x) for x in line[1:]]).astype(np.float32)
                    else:
                        emb_invalid += 1
                if emb_invalid > 0:
                    print 'WARNING: %i invalid lines' % emb_invalid
                c_found = 0
                c_lower = 0
                c_zeros = 0
                # Lookup table initialization
                for i in xrange(n_words):
                    word = self.id_to_word[i]
                    if word in pretrained:
                        new_weights[i] = pretrained[word]
                        c_found += 1
                    elif word.lower() in pretrained:
                        new_weights[i] = pretrained[word.lower()]
                        c_lower += 1
                    elif re.sub('\d', '0', word.lower()) in pretrained:
                        new_weights[i] = pretrained[re.sub(
                            '\d', '0', word.lower())]
                        c_zeros += 1
                word_layer.embeddings.set_value(new_weights)
                print 'Loaded %i pretrained embeddings.' % len(pretrained)
                print(
                    '%i / %i (%.4f%%) words have been initialized with '
                    'pretrained embeddings.') % (
                        c_found + c_lower + c_zeros, n_words, 100. *
                        (c_found + c_lower + c_zeros) / n_words)
                print(
                    '%i found directly, %i after lowercasing, '
                    '%i after lowercasing + zero.') % (
                        c_found, c_lower, c_zeros)  #}}}

        # Chars inputs
#{{{
        if char_dim:
            input_dim += char_lstm_dim
            char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer')

            char_lstm_for = LSTM(char_dim,
                                 char_lstm_dim,
                                 with_batch=True,
                                 name='char_lstm_for')
            char_lstm_rev = LSTM(char_dim,
                                 char_lstm_dim,
                                 with_batch=True,
                                 name='char_lstm_rev')

            char_lstm_for.link(char_layer.link(char_for_ids))
            char_lstm_rev.link(char_layer.link(char_rev_ids))

            char_for_output = char_lstm_for.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), char_pos_ids]
            char_rev_output = char_lstm_rev.h.dimshuffle(
                (1, 0, 2))[T.arange(s_len), char_pos_ids]
            char_output = T.concatenate([char_for_output, char_rev_output],
                                        axis=-1)
            inputs.append(char_for_output)
            if char_bidirect:
                inputs.append(char_rev_output)
                input_dim += char_lstm_dim
#}}}

# Capitalization feature
#
        if cap_dim:
            input_dim += cap_dim
            cap_layer = EmbeddingLayer(n_cap, cap_dim, name='cap_layer')
            inputs.append(cap_layer.link(cap_ids))

        #add feature
#{{{
        if features is not None and features['lemma']['isUsed']:
            lemma_layer = EmbeddingLayer(features['lemma']['num'],
                                         features['lemma']['dim'],
                                         name='lemma_layer')
            if features['lemma']['pre_emb'] != "":
                new_weights = lemma_layer.embeddings.get_value()
                loadPreEmbFeatures(features['lemma']['pre_emb'],
                                   features['feature_to_id_map']['lemma'],
                                   new_weights,
                                   lower=True)
                lemma_layer.embeddings.set_value(new_weights)
            lemma_output = lemma_layer.link(lemma_ids)
            if features['lemma']['lstm-input']:
                input_dim += features['lemma']['dim']
                inputs.append(lemma_output)
        if features is not None and features['pos']['isUsed']:
            pos_layer = EmbeddingLayer(features['pos']['num'],
                                       features['pos']['dim'],
                                       name='pos_layer')
            if features['pos']['pre_emb'] != "":
                new_weights = pos_layer.embeddings.get_value()
                loadPreEmbFeatures(features['pos']['pre_emb'],
                                   features['feature_to_id_map']['pos'],
                                   new_weights)
                pos_layer.embeddings.set_value(new_weights)
            pos_output = pos_layer.link(pos_ids)
            if features['pos']['lstm-input']:
                input_dim += features['pos']['dim']
                inputs.append(pos_output)
        if features is not None and features['chunk']['isUsed']:
            chunk_layer = EmbeddingLayer(features['chunk']['num'],
                                         features['chunk']['dim'],
                                         name='chunk_layer')
            chunk_output = chunk_layer.link(chunk_ids)
            if features['chunk']['lstm-input']:
                input_dim += features['chunk']['dim']
                inputs.append(chunk_output)
        if features is not None and features['dic']['isUsed']:
            dic_layer = EmbeddingLayer(features['dic']['num'],
                                       features['dic']['dim'],
                                       name='dic_layer')
            dic_output = dic_layer.link(dic_ids)
            if features['dic']['lstm-input']:
                input_dim += features['dic']['dim']
                inputs.append(dic_output)
#}}}

# Prepare final input
        if len(inputs) != 1:
            inputs = T.concatenate(inputs, axis=1)

        #
        # Dropout on final input
        #
        if dropout:
            dropout_layer = DropoutLayer(p=dropout)
            input_train = dropout_layer.link(inputs)
            input_test = (1 - dropout) * inputs
            inputs = T.switch(T.neq(is_train, 0), input_train, input_test)

        # LSTM for words
        word_lstm_for = LSTM(input_dim,
                             word_lstm_dim,
                             with_batch=False,
                             name='word_lstm_for')
        word_lstm_rev = LSTM(input_dim,
                             word_lstm_dim,
                             with_batch=False,
                             name='word_lstm_rev')
        if sentencesLevelLoss:

            def sentLSTM(i, output, input, lenVec):
                #{{{
                Len = lenVec[i]
                accLen = lenVec[:i].sum()
                currentInput = input[accLen:accLen + Len]
                word_lstm_for.link(currentInput)
                word_lstm_rev.link(currentInput[::-1, :])
                wordForOutput = word_lstm_for.h
                wordRevOutput = word_lstm_rev.h[::-1, :]
                finalOutput = T.concatenate([wordForOutput, wordRevOutput],
                                            axis=-1)
                output = T.set_subtensor(output[accLen:accLen + Len],
                                         finalOutput)
                return output
    #}}}

            result, update = theano.scan(
                fn=sentLSTM,
                outputs_info=T.zeros((inputs.shape[0], word_lstm_dim * 2),
                                     dtype='float32'),
                sequences=[T.arange(docLen.shape[0])],
                non_sequences=[inputs, docLen])
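            # scan threads the zero-initialised output matrix through sentLSTM once per
            # sentence, filling that sentence's rows at each step; result[-1] (used
            # below as final_output) is the matrix after the last sentence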

            word_lstm_for.link(inputs)
            word_lstm_rev.link(inputs[::-1, :])
            word_for_output = word_lstm_for.h
            word_for_c = word_lstm_for.c
            word_rev_output = word_lstm_rev.h[::-1, :]
            word_rev_c = word_lstm_rev.c[::-1, :]

            final_c = T.concatenate([word_for_c, word_rev_c], axis=-1)
            final_output = result[-1]
        else:
            word_lstm_for.link(inputs)
            word_lstm_rev.link(inputs[::-1, :])
            word_for_output = word_lstm_for.h
            word_for_c = word_lstm_for.c
            word_rev_output = word_lstm_rev.h[::-1, :]
            word_rev_c = word_lstm_rev.c[::-1, :]
            final_output = T.concatenate([word_for_output, word_rev_output],
                                         axis=-1)
            final_c = T.concatenate([word_for_c, word_rev_c], axis=-1)

        if useAttend:
            #attention layer
            attended = []
            attendedDim = 0
            if features is not None and features['word']['attended']:
                attended.append(wordTrue_input)
                attendedDim += word_dim
            if features is not None and features['char']['attended']:
                attended.append(char_output)
                attendedDim += char_lstm_dim * 2
            if features is not None and features['lemma']['attended']:
                attended.append(lemma_output)
                attendedDim += features['lemma']['dim']
            if features is not None and features['pos']['attended']:
                attended.append(pos_output)
                attendedDim += features['pos']['dim']
            if features is not None and features['chunk']['attended']:
                attended.append(chunk_output)
                attendedDim += features['chunk']['dim']
            if features is not None and features['dic']['attended']:
                attended.append(dic_output)
                attendedDim += features['dic']['dim']

            attention_layer = AttentionLayer(
                attended_dim=attendedDim,
                state_dim=attendedDim,
                #attention_layer=AttentionLayer(attended_dim=word_lstm_dim*2,
                #                               state_dim=word_lstm_dim*2,
                source_dim=word_lstm_dim * 2,
                scoreFunName=parameters['attenScoreFun'],
                name='attention_layer')

            if len(attended) > 1:
                attendedInput = T.concatenate(attended, axis=-1)
            else:
                attendedInput = attended[0]

            final_output = attention_layer.link(attendedInput, attendedInput,
                                                final_output)
            #using lstm_state to compute attention
            #final_output=attention_layer.link(final_output,final_c,final_output);
            self.energy = attention_layer.energy
        else:
            final_output = final_output

        tanh_layer = HiddenLayer(2 * word_lstm_dim,
                                 word_lstm_dim,
                                 name='tanh_layer',
                                 activation='tanh')
        final_output = tanh_layer.link(final_output)

        # Sentence to Named Entity tags - Score
        final_layer = HiddenLayer(word_lstm_dim,
                                  n_tags,
                                  name='final_layer',
                                  activation=(None if crf else 'softmax'))
        tags_scores = final_layer.link(final_output)

        # No CRF
        if not crf:
            cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean()
        # CRF
        else:
            if sentencesLevelLoss:
                #calculate the CRF loss sentence by sentence (sentence lengths come from docLen)
                def sentLoss(i, scores, trueIds, transitions, lenVec):
                    #{{{
                    Len = lenVec[i]
                    accLen = lenVec[:i].sum()
                    currentTagsScores = scores[accLen:accLen + Len]
                    currentIds = trueIds[accLen:accLen + Len]
                    real_path_score = currentTagsScores[T.arange(Len),
                                                        currentIds].sum()
                    # Score from transitions
                    padded_tags_ids = T.concatenate([[n_tags], currentIds],
                                                    axis=0)
                    real_path_score += transitions[
                        padded_tags_ids[T.arange(Len)],
                        padded_tags_ids[T.arange(Len) + 1]].sum()

                    all_paths_scores = forward(currentTagsScores, transitions)
                    cost = -(real_path_score - all_paths_scores)
                    return cost

    #}}}

                result, update = theano.scan(
                    fn=sentLoss,
                    outputs_info=None,
                    sequences=[T.arange(docLen.shape[0])],
                    non_sequences=[
                        tags_scores, tag_ids, self.transitions, docLen
                    ])
                cost = result.sum()
            else:
                real_path_score = tags_scores[T.arange(s_len), tag_ids].sum()

                # Score from transitions
                padded_tags_ids = T.concatenate([[n_tags], tag_ids], axis=0)
                real_path_score += self.transitions[
                    padded_tags_ids[T.arange(s_len)],
                    padded_tags_ids[T.arange(s_len) + 1]].sum()

                all_paths_scores = forward(tags_scores, self.transitions)
                cost = -(real_path_score - all_paths_scores)

        # Network parameters
        params = []
        if word_dim:
            self.add_component(word_layer)
            params.extend(word_layer.params)
        if char_dim:
            self.add_component(char_layer)
            self.add_component(char_lstm_for)
            params.extend(char_layer.params)
            params.extend(char_lstm_for.params)
            if char_bidirect:
                self.add_component(char_lstm_rev)
                params.extend(char_lstm_rev.params)
        self.add_component(word_lstm_for)
        params.extend(word_lstm_for.params)
        if word_bidirect:
            self.add_component(word_lstm_rev)
            params.extend(word_lstm_rev.params)
        if cap_dim:
            self.add_component(cap_layer)
            params.extend(cap_layer.params)
        self.add_component(final_layer)
        params.extend(final_layer.params)
        if crf:
            self.add_component(self.transitions)
            params.append(self.transitions)
        if word_bidirect:
            self.add_component(tanh_layer)
            params.extend(tanh_layer.params)
        #add feature layer
        if features is not None and features['lemma']['isUsed']:
            self.add_component(lemma_layer)
            params.extend(lemma_layer.params)
        if features is not None and features['pos']['isUsed']:
            self.add_component(pos_layer)
            params.extend(pos_layer.params)
        if features is not None and features['chunk']['isUsed']:
            self.add_component(chunk_layer)
            params.extend(chunk_layer.params)
        if features is not None and features['dic']['isUsed']:
            self.add_component(dic_layer)
            params.extend(dic_layer.params)

        if useAttend and reloadParam:
            #reload pre-train params
            model_path = self.model_path
            self.model_path = reloadPath
            print "loading:", self.model_path
            self.reload(features)
            self.model_path = model_path

        if useAttend:
            #add attention_layer
            self.add_component(attention_layer)
            params.extend(attention_layer.params)

        # Prepare train and eval inputs
        eval_inputs = []
        if word_dim:
            eval_inputs.append(word_ids)
        if char_dim:
            eval_inputs.append(char_for_ids)
            if char_bidirect:
                eval_inputs.append(char_rev_ids)
            eval_inputs.append(char_pos_ids)
        if cap_dim:
            eval_inputs.append(cap_ids)
        if useAttend:
            eval_inputs.append(wordTrue_ids)
            if sentencesLevelLoss:
                eval_inputs.append(docLen)
        #add feature input
        if features is not None and features['lemma']['isUsed']:
            eval_inputs.append(lemma_ids)
        if features is not None and features['pos']['isUsed']:
            eval_inputs.append(pos_ids)
        if features is not None and features['chunk']['isUsed']:
            eval_inputs.append(chunk_ids)
        if features is not None and features['dic']['isUsed']:
            eval_inputs.append(dic_ids)
        train_inputs = eval_inputs + [tag_ids]

        # Parse optimization method parameters
        if "-" in lr_method:
            lr_method_name = lr_method[:lr_method.find('-')]
            lr_method_parameters = {}
            for x in lr_method[lr_method.find('-') + 1:].split('-'):
                split = x.split('_')
                assert len(split) == 2
                lr_method_parameters[split[0]] = float(split[1])
        else:
            lr_method_name = lr_method
            lr_method_parameters = {}

        # Compile training function
        print 'Compiling...'
        if training:
            #constraints
            if useAttend:
                self.constraints = attention_layer.constraints
            else:
                self.constraints = {}
            from keras import optimizers
            self.optimizer = optimizers.SGD(lr=0.001,
                                            momentum=0.9,
                                            decay=0.,
                                            nesterov=True,
                                            clipvalue=5)
            self.optimizer = optimizers.RMSprop()
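            # note: this immediately overwrites the SGD optimizer above; neither is used
            # for `updates` below, which come from Optimization(clip=5.0) (the
            # self.optimizer.get_updates(...) alternative is left commented out)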
            #self.optimizer=SGD(lr=lr_method_parameters['lr'],clipvalue=5,gradient_noise=0.01)
            updates = Optimization(clip=5.0).get_updates(
                lr_method_name,
                cost,
                params,
                constraints=self.constraints,
                **lr_method_parameters)
            #updates = self.optimizer.get_updates(params,self.constraints,cost);
            f_train_outputs = [cost]
            if useAttend:
                f_train_outputs.append(self.energy)

            f_train = theano.function(inputs=train_inputs,
                                      outputs=f_train_outputs,
                                      updates=updates,
                                      on_unused_input='ignore',
                                      givens=({
                                          is_train: np.cast['int32'](1)
                                      } if dropout else {}))

            f_test = theano.function(inputs=train_inputs,
                                     outputs=cost,
                                     on_unused_input='ignore',
                                     givens=({
                                         is_train: np.cast['int32'](0)
                                     } if dropout else {}))
            self.f_test = f_test
        else:
            f_train = None

        # Compile evaluation function
        if not crf:
            f_eval = theano.function(inputs=eval_inputs,
                                     outputs=tags_scores,
                                     givens=({
                                         is_train: np.cast['int32'](0)
                                     } if dropout else {}))
        else:
            if sentencesLevelLoss:

                def sentVitebe(i, predictTag, scores, transitions, lenVec):
                    #{{{
                    Len = lenVec[i]
                    accLen = lenVec[:i].sum()
                    currentTagsScores = scores[accLen:accLen + Len]
                    currentPredictIds = forward(currentTagsScores,
                                                transitions,
                                                viterbi=True,
                                                return_alpha=False,
                                                return_best_sequence=True)
                    predictTag = T.set_subtensor(
                        predictTag[accLen:accLen + Len], currentPredictIds)
                    return predictTag
                    #}}}

                predictTag, update = theano.scan(
                    fn=sentVitebe,
                    outputs_info=T.zeros((tags_scores.shape[0], ),
                                         dtype='int32'),
                    sequences=[T.arange(docLen.shape[0])],
                    non_sequences=[tags_scores, self.transitions, docLen])
                predictTag = predictTag[-1]
            else:
                predictTag = forward(tags_scores,
                                     self.transitions,
                                     viterbi=True,
                                     return_alpha=False,
                                     return_best_sequence=True)
            f_eval = theano.function(inputs=eval_inputs,
                                     outputs=predictTag,
                                     on_unused_input='ignore',
                                     givens=({
                                         is_train: np.cast['int32'](0)
                                     } if dropout else {}))
            #f_AttenVisual=theano.function(
            #    inputs=eval_inputs,
            #    outputs=[predictTag,self.energy],
            #    on_unused_input='ignore',
            #    givens=({is_train: np.cast['int32'](0)} if dropout else {})
            #    )
            #self.f_AttenVisual=f_AttenVisual;

        return f_train, f_eval
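The sentence-level variants above (sentLSTM, sentLoss, sentVitebe) all rely on the same
docLen bookkeeping: the document is one flat matrix of tokens, docLen holds the
per-sentence lengths, and sentence i occupies the rows from docLen[:i].sum() to
docLen[:i].sum() + docLen[i]. A tiny NumPy illustration of that slicing (not code from
the repo):

import numpy as np

doc_len = np.array([4, 2, 3])               # three sentences, 9 tokens in total
inputs = np.random.randn(doc_len.sum(), 5)  # flattened document, 5 features per token
sentences = []
for i in range(len(doc_len)):
    start = doc_len[:i].sum()
    sentences.append(inputs[start:start + doc_len[i]])
# sentences[0].shape == (4, 5), sentences[1].shape == (2, 5), sentences[2].shape == (3, 5)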
Пример #18
0
def accuracy(nn, data, label, thr=0.5):
    # Note: despite its name, this returns the percentage of misclassified samples (the error rate).
    predict = [np.int8(nnDif.forward(nn, data[c, :]) > thr) == label[c]
               for c in range(data.shape[0])]
    return 100 * np.double(len(np.where(np.asarray(predict) == False)[0])) / np.double(len(predict))
Пример #19
0
    def build(self,
              dropout,
              char_dim,
              char_hidden_dim,
              char_bidirect,
              layer2_hidden_dim,
              lr_method,
              layer2,
              batch_size,
              pre_emb,
              use_gaze,
              crf,
              training=True,
              **kwargs):
        """
        Build the network.
        """
        # Training parameters
        n_chars = len(self.id_to_char)
        n_tags = len(self.id_to_tag)

        # Network variables
        is_train = T.iscalar('is_train')  # declare integer scalar variable is_train
        char_ids = T.ivector(name='char_ids')  # declare integer 1-D vector of char ids
        if use_gaze:
            gaze = T.imatrix(name='gaze')
        #hamming_cost = T.matrix('hamming_cost', theano.config.floatX) # declare 2-D matrix
        # tag_ids = T.imatrix(name='tag_ids')
        tag_ids = T.ivector(name='tag_ids')
        # Sentence length
        s_len = char_ids.shape[0]  # number of characters in each sentence

        # Final input (all word features)
        #
        # Char inputs
        #
        if char_dim:
            char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer')
            char_input = char_layer.link(char_ids)
            # Initialize with pretrained embeddings
            if pre_emb and training:
                new_weights = char_layer.embeddings.get_value()
                print 'Loading pretrained embeddings from %s...' % pre_emb
                pretrained = {}
                emb_invalid = 0
                for i, line in enumerate(
                        codecs.open(pre_emb, 'r', 'utf-8', 'ignore')):
                    line = line.rstrip().split()
                    if len(line) == char_dim + 1:
                        pretrained[line[0]] = np.array(
                            [float(x) for x in line[1:]]).astype(np.float32)
                    else:
                        emb_invalid += 1
                if emb_invalid > 0:
                    print 'WARNING: %i invalid lines' % emb_invalid
                c_found = 0
                c_lower = 0
                c_zeros = 0
                # Lookup table initialization
                for i in xrange(n_chars):
                    char = self.id_to_char[i]
                    if char in pretrained:
                        new_weights[i] = pretrained[char]
                        c_found += 1
                    elif char.lower() in pretrained:
                        new_weights[i] = pretrained[char.lower()]
                        c_lower += 1
                    elif re.sub('\d', '0', char) in pretrained:
                        new_weights[i] = pretrained[re.sub('\d', '0', char)]
                        c_zeros += 1
                char_layer.embeddings.set_value(new_weights)
                print 'Loaded %i pretrained embeddings.' % len(pretrained)
                print(
                    '%i / %i (%.4f%%) chars have been initialized with '
                    'pretrained embeddings.') % (
                        c_found + c_lower + c_zeros, n_chars, 100. *
                        (c_found + c_lower + c_zeros) / n_chars)
                print('%i found directly, %i after lower, %i after zero.') % (
                    c_found, c_lower, c_zeros)

        #
        # Dropout on final input
        #
        if dropout:
            dropout_layer = DropoutLayer(p=dropout)
            input_train = dropout_layer.link(char_input)
            input_test = (1 - dropout) * char_input
            char_input = T.switch(T.neq(is_train, 0), input_train,
                                  input_test)  # conditional switch between train and test inputs

        # LSTM for chars, first layer
        char_lstm_for1 = LSTM(char_dim,
                              char_hidden_dim,
                              with_batch=False,
                              name='first_char_lstm_for')
        char_lstm_rev1 = LSTM(char_dim,
                              char_hidden_dim,
                              with_batch=False,
                              name='first_char_lstm_rev')
        char_lstm_for1.link(char_input)  # char order: l i k e
        char_lstm_rev1.link(char_input[::-1, :])  # reversed char order: e k i l
        char_for_output1 = char_lstm_for1.h
        char_rev_output1 = char_lstm_rev1.h[::-1, :]

        if char_bidirect:
            final_output = T.concatenate([char_for_output1, char_rev_output1],
                                         axis=1)
            tanh_layer1 = HiddenLayer(2 * char_hidden_dim,
                                      char_hidden_dim,
                                      name='tanh_layer1',
                                      activation='tanh')
            final_output = tanh_layer1.link(final_output)
        else:
            final_output = char_for_output1

        if layer2:
            #
            # Dropout on final input
            #
            if dropout:
                dropout_layer = DropoutLayer(p=dropout)
                input_train = dropout_layer.link(final_output)
                input_test = (1 - dropout) * final_output
                final_output = T.switch(T.neq(is_train, 0), input_train,
                                        input_test)  # conditional switch between train and test inputs

            # LSTM for chars, second layer
            char_lstm_for2 = LSTM(char_hidden_dim,
                                  layer2_hidden_dim,
                                  with_batch=False,
                                  name='second_char_lstm_for')
            char_lstm_rev2 = LSTM(char_hidden_dim,
                                  layer2_hidden_dim,
                                  with_batch=False,
                                  name='second_char_lstm_rev')
            char_lstm_for2.link(final_output)
            char_lstm_rev2.link(final_output[::-1, :])
            char_for_output2 = char_lstm_for2.h
            char_rev_output2 = char_lstm_rev2.h[::-1, :]

            if char_bidirect:
                final_output = T.concatenate(
                    [char_for_output2, char_rev_output2], axis=1)
                tanh_layer2 = HiddenLayer(2 * layer2_hidden_dim,
                                          layer2_hidden_dim,
                                          name='tanh_layer2',
                                          activation='tanh')
                final_output = tanh_layer2.link(final_output)
            else:
                final_output = char_for_output2

        if layer2:
            dims = layer2_hidden_dim
        else:
            dims = char_hidden_dim

        if use_gaze:
            final_output = T.concatenate([final_output, gaze], axis=1)
            dims = dims + n_tags
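            # gaze is an (s_len, n_tags) integer matrix (presumably per-tag
            # gazetteer/dictionary indicators) appended to the LSTM output before the
            # final projection, hence the extra n_tags columns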

        # final_output = T.reshape(final_output, (-1, input_dim))

        # Sentence to Named Entity tags - Score: hidden layer between c_i and the CRF
        final_layer = HiddenLayer(dims,
                                  n_tags,
                                  name='final_layer',
                                  activation=(None if crf else 'softmax'))
        tags_scores = final_layer.link(final_output)

        # No CRF
        if not crf:
            cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean()
        # CRF
        else:
            transitions = shared((n_tags + 2, n_tags + 2), 'transitions')

            small = -1000
            b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32)
            e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32)
            observations = T.concatenate(
                [tags_scores, small * T.ones((s_len, 2))], axis=1)
            observations = T.concatenate([b_s, observations, e_s], axis=0)

            # Score from tags
            real_path_score = tags_scores[T.arange(s_len),
                                          tag_ids].sum()  # sum of the corresponding entries of the emission matrix P

            # Score from transitions
            b_id = theano.shared(value=np.array([n_tags], dtype=np.int32))
            e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32))
            padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0)
            real_path_score += transitions[
                padded_tags_ids[T.arange(s_len + 1)],
                padded_tags_ids[T.arange(s_len + 1) + 1]].sum()  # sum of the corresponding entries of the transition matrix A

            all_paths_scores = forward(observations, transitions)
            cost = -(real_path_score - all_paths_scores)

        # Network parameters
        params = []
        if char_dim:
            self.add_component(char_layer)
            params.extend(char_layer.params)

        self.add_component(char_lstm_for1)
        params.extend(char_lstm_for1.params)
        if char_bidirect:
            self.add_component(char_lstm_rev1)
            params.extend(char_lstm_rev1.params)

            self.add_component(tanh_layer1)
            params.extend(tanh_layer1.params)

        if layer2:
            self.add_component(char_lstm_for2)
            params.extend(char_lstm_for2.params)
            if char_bidirect:
                self.add_component(char_lstm_rev2)
                params.extend(char_lstm_rev2.params)

                self.add_component(tanh_layer2)
                params.extend(tanh_layer2.params)

        self.add_component(final_layer)
        params.extend(final_layer.params)
        if crf:
            self.add_component(transitions)
            params.append(transitions)

        # Prepare train and eval inputs
        eval_inputs = []
        if char_dim:
            eval_inputs.append(char_ids)
        if use_gaze:
            eval_inputs.append(gaze)
        train_inputs = eval_inputs + [tag_ids]

        # Parse optimization method parameters
        if "-" in lr_method:
            lr_method_name = lr_method[:lr_method.find('-')]
            lr_method_parameters = {}
            for x in lr_method[lr_method.find('-') + 1:].split('-'):
                split = x.split('_')
                assert len(split) == 2
                lr_method_parameters[split[0]] = float(split[1])
        else:
            lr_method_name = lr_method
            lr_method_parameters = {}

        # Compile training function
        print 'Compiling...'
        if training:
            updates = Optimization(clip=5.0).get_updates(
                lr_method_name, cost, params, **lr_method_parameters)
            f_train = theano.function(inputs=train_inputs,
                                      outputs=cost,
                                      updates=updates,
                                      givens=({
                                          is_train: np.cast['int32'](1)
                                      } if dropout else {}))
        else:
            f_train = None

        # Compile evaluation function
        if not crf:
            f_eval = theano.function(inputs=eval_inputs,
                                     outputs=tags_scores,
                                     givens=({
                                         is_train: np.cast['int32'](0)
                                     } if dropout else {}))
        else:
            f_eval = theano.function(inputs=eval_inputs,
                                     outputs=forward(
                                         observations,
                                         transitions,
                                         viterbi=True,
                                         return_alpha=False,
                                         return_best_sequence=True),
                                     givens=({
                                         is_train: np.cast['int32'](0)
                                     } if dropout else {}))

        return f_train, f_eval
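At evaluation time every CRF variant in this listing calls forward(..., viterbi=True,
return_best_sequence=True). As an illustration of what that call is assumed to return
(plain NumPy, not the repo's Theano implementation; `viterbi_decode` is a made-up
name), here is the best-scoring tag sequence under the same emission plus transition
scores:

import numpy as np

def viterbi_decode(observations, transitions):
    # observations: (T, K) emission scores; transitions: (K, K), [i, j] = score(i -> j)
    n_steps, n_tags = observations.shape
    score = observations[0]
    backpointers = []
    for t in range(1, n_steps):
        total = score[:, None] + transitions + observations[t][None, :]
        backpointers.append(total.argmax(axis=0))  # best previous tag for each tag
        score = total.max(axis=0)
    best = [int(score.argmax())]
    for bp in reversed(backpointers):
        best.append(int(bp[best[-1]]))
    return best[::-1]                              # highest-scoring tag sequence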
Пример #20
0
    def build(self,
              dropout,
              char_dim,
              char_lstm_dim,
              char_bidirect,
              word_dim,
              word_lstm_dim,
              word_bidirect,
              lr_method,
              pre_emb,
              pre_voc,
              crf,
              pos_dim,
              n_pos,
              training=1,
              **kwargs
              ):
        """
        Build the network.
        """

        # Training parameters
        n_words = len(self.id_to_word)
        n_chars = len(self.id_to_char)
        n_tags = len(self.id_to_y)
        n_cap = 2

        # Network variables
        is_train = T.iscalar('is_train')
        word_ids = T.ivector(name='word_ids')
        char_for_ids = T.imatrix(name='char_for_ids')
        char_rev_ids = T.imatrix(name='char_rev_ids')
        char_pos_ids = T.ivector(name='char_pos_ids')
        tag_ids = T.ivector(name='tag_ids')
        cap_ids = T.ivector(name='cap_ids')
        if pos_dim:
            pos_ids = T.ivector(name='pos_ids')

        # Sentence length
        s_len = (word_ids if word_dim else char_pos_ids).shape[0]

        # Final input (all word features)
        input_dim = 0
        inputs = []

        #
        # Word inputs
        #
        if word_dim:
            input_dim += word_dim
            word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer')
            word_input = word_layer.link(word_ids)
            inputs.append(word_input)
            # Initialize with pretrained embeddings
            if pre_emb and training:
                new_weights = word_layer.embeddings.get_value()
                print 'Loading pretrained embeddings from %s...' % pre_emb
                pretrained = {}
                emb_invalid = 0
                emb_matrix = np.load(pre_emb)
                pre_w2idxs = dict([(w,i) for i,w in enumerate(np.load(pre_voc))])
                print pre_w2idxs.items()[:10]
                assert emb_matrix[0].shape[0] == word_dim
                for w in pre_w2idxs:
                    pretrained[w.lower()] = np.array(
                        [float(x) for x in emb_matrix[pre_w2idxs[w]]]).astype(np.float32)
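                # unlike the text-format loaders above, this variant reads a dense .npy
                # embedding matrix plus a parallel .npy vocabulary file (pre_voc); keys
                # are lowercased here, so uppercase words only match through the
                # word.lower() branch below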
                if emb_invalid > 0:
                    print 'WARNING: %i invalid lines' % emb_invalid
                c_found = 0
                c_lower = 0
                c_zeros = 0
                # Lookup table initialization
                for i in xrange(n_words):
                    word = self.id_to_word[i]
                    if word in pretrained:
                        new_weights[i] = pretrained[word]
                        c_found += 1
                    elif word.lower() in pretrained:
                        new_weights[i] = pretrained[word.lower()]
                        c_lower += 1
                    elif re.sub('\d', '0', word.lower()) in pretrained:
                        new_weights[i] = pretrained[
                            re.sub('\d', '0', word.lower())
                        ]
                        c_zeros += 1
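                # Words not covered by any of the three lookups above keep
                # their random initialization.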
                word_layer.embeddings.set_value(new_weights)
                print 'Loaded %i pretrained embeddings.' % len(pretrained)
                print ('%i / %i (%.4f%%) words have been initialized with '
                       'pretrained embeddings.') % (
                            c_found + c_lower + c_zeros, n_words,
                            100. * (c_found + c_lower + c_zeros) / n_words
                      )
                print ('%i found directly, %i after lowercasing, '
                       '%i after lowercasing + zero.') % (
                          c_found, c_lower, c_zeros
                      )

        #
        # Chars inputs
        #
        if char_dim:
            input_dim += char_lstm_dim
            char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer')

            char_lstm_for = LSTM(char_dim, char_lstm_dim, with_batch=True,
                                 name='char_lstm_for')
            char_lstm_rev = LSTM(char_dim, char_lstm_dim, with_batch=True,
                                 name='char_lstm_rev')

            char_lstm_for.link(char_layer.link(char_for_ids))
            char_lstm_rev.link(char_layer.link(char_rev_ids))

            char_for_output = char_lstm_for.h.dimshuffle((1, 0, 2))[
                T.arange(s_len), char_pos_ids
            ]
            char_rev_output = char_lstm_rev.h.dimshuffle((1, 0, 2))[
                T.arange(s_len), char_pos_ids
            ]

            inputs.append(char_for_output)
            if char_bidirect:
                inputs.append(char_rev_output)
                input_dim += char_lstm_dim

        #
        # Cue feature
        #

        input_dim += word_dim
        cap_layer = EmbeddingLayer(n_cap, word_dim, name='cap_layer')
        inputs.append(cap_layer.link(cap_ids))
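        # Note: the cue embedding reuses word_dim as its size, which is why
        # input_dim is incremented by word_dim above.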

        #
        # POS feature
        #

        if pos_dim:
            input_dim += word_dim
            pos_layer = EmbeddingLayer(n_pos, word_dim, name="pos_layer")
            inputs.append(pos_layer.link(pos_ids))

        # Prepare final input
        # if len(inputs) != 1:
        inputs = T.concatenate(inputs, axis=1)

        #
        # Dropout on final input
        #
        if dropout:
            dropout_layer = DropoutLayer(p=dropout)
            input_train = dropout_layer.link(inputs)
            input_test = (1 - dropout) * inputs
            inputs = T.switch(T.neq(is_train, 0), input_train, input_test)
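            # At test time the inputs are scaled by (1 - p) so their expected
            # magnitude matches the training inputs under this (non-inverted)
            # dropout scheme.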

        # LSTM for words
        word_lstm_for = LSTM(input_dim, word_lstm_dim, with_batch=False,
                             name='word_lstm_for')
        word_lstm_rev = LSTM(input_dim, word_lstm_dim, with_batch=False,
                             name='word_lstm_rev')
        word_lstm_for.link(inputs)
        word_lstm_rev.link(inputs[::-1, :])
        word_for_output = word_lstm_for.h
        word_rev_output = word_lstm_rev.h[::-1, :]
        if word_bidirect:
            final_output = T.concatenate(
                [word_for_output, word_rev_output],
                axis=1
            )
            tanh_layer = HiddenLayer(2 * word_lstm_dim, word_lstm_dim,
                                     name='tanh_layer', activation='tanh')
            final_output = tanh_layer.link(final_output)
        else:
            final_output = word_for_output

        # Sentence to Named Entity tags - Score
        final_layer = HiddenLayer(word_lstm_dim, n_tags, name='final_layer',
                                  activation=(None if crf else 'softmax'))
        tags_scores = final_layer.link(final_output)

        # No CRF
        if not crf:
            cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean()
        # CRF
        else:
            transitions = shared((n_tags + 2, n_tags + 2), 'transitions')

            small = -1000
            b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32)
            e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32)
            observations = T.concatenate(
                [tags_scores, small * T.ones((s_len, 2))],
                axis=1
            )
            observations = T.concatenate(
                [b_s, observations, e_s],
                axis=0
            )
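            # observations now has shape (s_len + 2, n_tags + 2): two extra
            # columns for the artificial begin/end tags, plus the begin/end
            # rows b_s and e_s.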

            # Score from tags
            real_path_score = tags_scores[T.arange(s_len), tag_ids].sum()

            # Score from transitions
            b_id = theano.shared(value=np.array([n_tags], dtype=np.int32))
            e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32))
            padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0)
            real_path_score += transitions[
                padded_tags_ids[T.arange(s_len + 1)],
                padded_tags_ids[T.arange(s_len + 1) + 1]
            ].sum()

            all_paths_scores = forward(observations, transitions)
            cost = - (real_path_score - all_paths_scores)
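            # CRF negative log-likelihood: real_path_score is the score of the
            # gold tag sequence and all_paths_scores is the log-partition
            # returned by the forward algorithm, so cost = -log p(tags | sentence).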

        # Network parameters
        params = []
        if word_dim:
            self.add_component(word_layer)
            params.extend(word_layer.params)
        if char_dim:
            self.add_component(char_layer)
            self.add_component(char_lstm_for)
            params.extend(char_layer.params)
            params.extend(char_lstm_for.params)
            if char_bidirect:
                self.add_component(char_lstm_rev)
                params.extend(char_lstm_rev.params)
        self.add_component(word_lstm_for)
        params.extend(word_lstm_for.params)
        if word_bidirect:
            self.add_component(word_lstm_rev)
            params.extend(word_lstm_rev.params)
        # Add cue layer (cap for the moment)
        self.add_component(cap_layer)
        params.extend(cap_layer.params)
        # Add pos tag layer
        if pos_dim:
            self.add_component(pos_layer)
            params.extend(pos_layer.params)

        self.add_component(final_layer)
        params.extend(final_layer.params)
        if crf:
            self.add_component(transitions)
            params.append(transitions)
        if word_bidirect:
            self.add_component(tanh_layer)
            params.extend(tanh_layer.params)

        # Prepare train and eval inputs
        eval_inputs = []
        if word_dim:
            eval_inputs.append(word_ids)
        if char_dim:
            eval_inputs.append(char_for_ids)
            if char_bidirect:
                eval_inputs.append(char_rev_ids)
            eval_inputs.append(char_pos_ids)
        # add cue vector to the inputs
        eval_inputs.append(cap_ids)
        # add pos vector to the inputs
        if pos_dim:
            eval_inputs.append(pos_ids)

        train_inputs = eval_inputs + [tag_ids]

        # Parse optimization method parameters
        if "-" in lr_method:
            lr_method_name = lr_method[:lr_method.find('-')]
            lr_method_parameters = {}
            for x in lr_method[lr_method.find('-') + 1:].split('-'):
                split = x.split('_')
                assert len(split) == 2
                lr_method_parameters[split[0]] = float(split[1])
        else:
            lr_method_name = lr_method
            lr_method_parameters = {}
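        # e.g. lr_method = 'sgd-lr_0.005' gives lr_method_name = 'sgd' and
        # lr_method_parameters = {'lr': 0.005} (illustrative example).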

        # Compile training function
        print 'Compiling...'
        if training:
            updates = Optimization(clip=5.0).get_updates(lr_method_name, cost, params, **lr_method_parameters)
            f_train = theano.function(
                inputs=train_inputs,
                outputs=cost,
                updates=updates,
                givens=({is_train: np.cast['int32'](1)} if dropout else {})
            )
        else:
            f_train = None

        # Compile evaluation function
        if not crf:
            f_eval = theano.function(
                inputs=eval_inputs,
                outputs=tags_scores,
                givens=({is_train: np.cast['int32'](0)} if dropout else {})
            )
        else:
            f_eval = theano.function(
                inputs=eval_inputs,
                outputs=forward(observations, transitions, viterbi=True,
                                return_alpha=False, return_best_sequence=True),
                givens=({is_train: np.cast['int32'](0)} if dropout else {})
            )

        return f_train, f_eval
Example #21
0
    def build(self,
              dropout,
              char_dim,
              char_lstm_dim,
              char_bidirect,
              word_dim,
              word_lstm_dim,
              word_bidirect,
              lr_method,
              pre_emb,
              crf,
              cap_dim,
              training=True,
              **kwargs
              ):
        """
        Build the network.
        """
        # Training parameters
        n_words = len(self.id_to_word)
        n_chars = len(self.id_to_char)
        n_tags = len(self.id_to_tag)

        # Number of capitalization features
        if cap_dim:
            n_cap = 4

        # Network variables
        is_train = T.iscalar('is_train')
        word_ids = T.ivector(name='word_ids')
        char_for_ids = T.imatrix(name='char_for_ids')
        char_rev_ids = T.imatrix(name='char_rev_ids')
        char_pos_ids = T.ivector(name='char_pos_ids')
        tag_ids = T.ivector(name='tag_ids')
        if cap_dim:
            cap_ids = T.ivector(name='cap_ids')

        # Sentence length
        s_len = (word_ids if word_dim else char_pos_ids).shape[0]

        # Final input (all word features)
        input_dim = 0
        inputs = []

        

        #
        # Word inputs
        #
        if word_dim:
            input_dim += word_dim
            word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer')
            word_input = word_layer.link(word_ids)
            inputs.append(word_input)

            # Initialize with pretrained embeddings
            if pre_emb and training:
                
                # Randomly generates new weights
                new_weights = word_layer.embeddings.get_value()
                print 'Loading pretrained embeddings from %s...' % pre_emb
                
                # Here we substitute the pyemblib read function for the
                # original line-by-line loader (kept commented out below).
                # Syntax: get_embedding_dict(emb_path, emb_format, first_n, vocab)
                emb_format = pyemblib2.Format.Word2Vec
                pretrained = get_embedding_dict(pre_emb, emb_format, 0, None)
                ''' 
                pretrained = {}
                emb_invalid = 0
                for i, line in enumerate(codecs.open(pre_emb, 'r', 'utf-8')):
                    line = line.rstrip().split()
                    if len(line) == word_dim + 1:
                        pretrained[line[0]] = np.array(
                            [float(x) for x in line[1:]]
                        ).astype(np.float32)
                    else:
                        emb_invalid += 1
                if emb_invalid > 0:
                    print 'WARNING: %i invalid lines' % emb_invalid
                '''
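                # get_embedding_dict is assumed to return the same structure as
                # the commented-out loader above: a dict mapping each word to a
                # float32 numpy vector of length word_dim.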
                
                c_found = 0
                c_lower = 0
                c_zeros = 0

                # Lookup table initialization
                for i in xrange(n_words):
                    word = self.id_to_word[i]
                    if word in pretrained:
                        new_weights[i] = pretrained[word]
                        c_found += 1
                    elif word.lower() in pretrained:
                        new_weights[i] = pretrained[word.lower()]
                        c_lower += 1
                    elif re.sub('\d', '0', word.lower()) in pretrained:
                        new_weights[i] = pretrained[
                            re.sub('\d', '0', word.lower())
                        ]
                        c_zeros += 1
                
                # "word_layer.embeddings" is a theano shared variable; its
                # value is the matrix we want to print later.
                word_layer.embeddings.set_value(new_weights)
                print 'Loaded %i pretrained embeddings.' % len(pretrained)
                print ('%i / %i (%.4f%%) words have been initialized with '
                       'pretrained embeddings.') % (
                            c_found + c_lower + c_zeros, n_words,
                            100. * (c_found + c_lower + c_zeros) / n_words
                      )
                print ('%i found directly, %i after lowercasing, '
                       '%i after lowercasing + zero.') % (
                          c_found, c_lower, c_zeros
                      )

        #
        # Chars inputs
        #
        if char_dim:
            input_dim += char_lstm_dim
            char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer')

            char_lstm_for = LSTM(char_dim, char_lstm_dim, with_batch=True,
                                 name='char_lstm_for')
            char_lstm_rev = LSTM(char_dim, char_lstm_dim, with_batch=True,
                                 name='char_lstm_rev')

            char_lstm_for.link(char_layer.link(char_for_ids))
            char_lstm_rev.link(char_layer.link(char_rev_ids))

            char_for_output = char_lstm_for.h.dimshuffle((1, 0, 2))[
                T.arange(s_len), char_pos_ids
            ]
            char_rev_output = char_lstm_rev.h.dimshuffle((1, 0, 2))[
                T.arange(s_len), char_pos_ids
            ]

            inputs.append(char_for_output)
            if char_bidirect:
                inputs.append(char_rev_output)
                input_dim += char_lstm_dim

        #
        # Capitalization feature
        #
        if cap_dim:
            input_dim += cap_dim
            cap_layer = EmbeddingLayer(n_cap, cap_dim, name='cap_layer')
            inputs.append(cap_layer.link(cap_ids))

        # Prepare final input
        inputs = T.concatenate(inputs, axis=1) if len(inputs) != 1 else inputs[0]

        #
        # Dropout on final input
        #
        if dropout:
            dropout_layer = DropoutLayer(p=dropout)
            input_train = dropout_layer.link(inputs)
            input_test = (1 - dropout) * inputs
            inputs = T.switch(T.neq(is_train, 0), input_train, input_test)

        # LSTM for words
        word_lstm_for = LSTM(input_dim, word_lstm_dim, with_batch=False,
                             name='word_lstm_for')
        word_lstm_rev = LSTM(input_dim, word_lstm_dim, with_batch=False,
                             name='word_lstm_rev')
        word_lstm_for.link(inputs)
        word_lstm_rev.link(inputs[::-1, :])
        word_for_output = word_lstm_for.h
        word_rev_output = word_lstm_rev.h[::-1, :]
        if word_bidirect:
            final_output = T.concatenate(
                [word_for_output, word_rev_output],
                axis=1
            )
            tanh_layer = HiddenLayer(2 * word_lstm_dim, word_lstm_dim,
                                     name='tanh_layer', activation='tanh')
            final_output = tanh_layer.link(final_output)
        else:
            final_output = word_for_output

        # Sentence to Named Entity tags - Score
        final_layer = HiddenLayer(word_lstm_dim, n_tags, name='final_layer',
                                  activation=(None if crf else 'softmax'))
        tags_scores = final_layer.link(final_output)

        # No CRF
        if not crf:
            cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean()
        # CRF
        else:
            transitions = shared((n_tags + 2, n_tags + 2), 'transitions')

            small = -1000
            b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32)
            e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32)
            observations = T.concatenate(
                [tags_scores, small * T.ones((s_len, 2))],
                axis=1
            )
            observations = T.concatenate(
                [b_s, observations, e_s],
                axis=0
            )

            # Score from tags
            real_path_score = tags_scores[T.arange(s_len), tag_ids].sum()

            # Score from transitions
            b_id = theano.shared(value=np.array([n_tags], dtype=np.int32))
            e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32))
            padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0)
            real_path_score += transitions[
                padded_tags_ids[T.arange(s_len + 1)],
                padded_tags_ids[T.arange(s_len + 1) + 1]
            ].sum()

            all_paths_scores = forward(observations, transitions)
            cost = - (real_path_score - all_paths_scores)

        # Network parameters
        params = []
        if word_dim:
            self.add_component(word_layer)

            # The commented-out line below prevents the model from updating
            # the pretrained embeddings.

            # params.extend(word_layer.params)
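            # Since get_updates only builds updates for parameters in `params`,
            # leaving word_layer.params out keeps the embeddings frozen.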
        if char_dim:
            self.add_component(char_layer)
            self.add_component(char_lstm_for)
            params.extend(char_layer.params)
            params.extend(char_lstm_for.params)
            if char_bidirect:
                self.add_component(char_lstm_rev)
                params.extend(char_lstm_rev.params)
        self.add_component(word_lstm_for)
        params.extend(word_lstm_for.params)
        if word_bidirect:
            self.add_component(word_lstm_rev)
            params.extend(word_lstm_rev.params)
        if cap_dim:
            self.add_component(cap_layer)
            params.extend(cap_layer.params)
        self.add_component(final_layer)
        params.extend(final_layer.params)
        if crf:
            self.add_component(transitions)
            params.append(transitions)
        if word_bidirect:
            self.add_component(tanh_layer)
            params.extend(tanh_layer.params)

        # Prepare train and eval inputs
        eval_inputs = []
        if word_dim:
            eval_inputs.append(word_ids)
        if char_dim:
            eval_inputs.append(char_for_ids)
            if char_bidirect:
                eval_inputs.append(char_rev_ids)
            eval_inputs.append(char_pos_ids)
        if cap_dim:
            eval_inputs.append(cap_ids)
        train_inputs = eval_inputs + [tag_ids]

        # Parse optimization method parameters
        if "-" in lr_method:
            lr_method_name = lr_method[:lr_method.find('-')]
            lr_method_parameters = {}
            for x in lr_method[lr_method.find('-') + 1:].split('-'):
                split = x.split('_')
                assert len(split) == 2
                lr_method_parameters[split[0]] = float(split[1])
        else:
            lr_method_name = lr_method
            lr_method_parameters = {}

        # Compile training function
        print 'Compiling...'
        if training:
            
            # "params" supposedly contains the pretrained embedding matrix that we are updating. 
            # Find the "get_updates" function and figure out what it does.  
            updates = Optimization(clip=5.0).get_updates(lr_method_name, cost, params, **lr_method_parameters)
            f_train = theano.function(
                inputs=train_inputs,
                outputs=cost,
                updates=updates,
                givens=({is_train: np.cast['int32'](1)} if dropout else {})
            )
            #========================================
            # FUNCTION TO PRINT PRETRAINED EMBEDDINGS
            # The function below takes one matrix argument and prints it
            # along with the specified print message.
            print_matrix = T.dmatrix()
            print_op = theano.printing.Print('print message')
            printed_x = print_op(print_matrix)
            f_print = theano.function([print_matrix], printed_x)
            #========================================
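            # Hypothetical usage (not part of the original code):
            #   f_print(word_layer.embeddings.get_value().astype('float64'))
            # would print the current embedding matrix through the Print op
            # (the cast is needed because print_matrix is a float64 dmatrix).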
        else:
            f_train = None
            f_print = None

        # Bundle the print helper with the embedding matrix so the caller can
        # inspect the pretrained embeddings (assumes word_dim is non-zero,
        # otherwise word_layer is undefined).
        print_tuple = [f_print, word_layer.embeddings]

        # Compile evaluation function
        if not crf:
            f_eval = theano.function(
                inputs=eval_inputs,
                outputs=tags_scores,
                givens=({is_train: np.cast['int32'](0)} if dropout else {})
            )
        else:
            f_eval = theano.function(
                inputs=eval_inputs,
                outputs=forward(observations, transitions, viterbi=True,
                                return_alpha=False, return_best_sequence=True),
                givens=({is_train: np.cast['int32'](0)} if dropout else {})
            )

        return f_train, f_eval, print_tuple