Example #1
    def __call__(self, tokenized_sentences_lst):
        # Input placeholders to the biLM.
        context_character_ids = tf.placeholder('int32', shape=(None, None, self.max_characters_per_token))

        # Get ops to compute the LM embeddings.
        context_embeddings_op = self.bilm(context_character_ids)

        # Get an op to compute ELMo (weighted average of the internal biLM layers)
        elmo_context_input = weight_layers('input', context_embeddings_op, l2_coef=0.0)
        elmo_context_output = weight_layers('output', context_embeddings_op, l2_coef=0.0)

        # Now we can compute embeddings.
        context_tokens  = [sentence.split() for sentence in tokenized_sentences_lst]

        with tf.Session() as sess:
            # It is necessary to initialize variables once before running inference.
            sess.run(tf.global_variables_initializer())

            # Create batches of data.
            context_ids = self.batcher.batch_sentences(context_tokens)

            # Compute ELMo representations (here for the input only, for simplicity).
            elmo_context_vecs = sess.run(
                [elmo_context_input['weighted_op']],
                feed_dict={context_character_ids: context_ids}
            )

        return elmo_context_vecs[0]  #, context_tokens, context_ids
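A minimal usage sketch for the __call__ method above. The wrapper class name and its constructor arguments are assumptions (the snippet shows only the method); it expects whitespace-tokenized sentences passed as strings.

# Hypothetical wrapper (class name and constructor arguments are assumed).
embedder = ElmoSentenceEmbedder(options_file='options.json',
                                weight_file='weights.hdf5',
                                vocab_file='vocab.txt')
# Each element is an already-tokenized sentence joined by spaces,
# because __call__ splits on whitespace.
vecs = embedder(['Pretrained biLMs compute contextual representations .',
                 'They are useful in downstream tasks .'])
print(vecs.shape)  # (2, max_sentence_length, 1024) for the standard ELMo models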
Example #2
    def build_embeddings_op(self, context_ids_ph, utterances_ids_ph,
                            context_sentence_ids_ph):

        bilm = BidirectionalLanguageModel(
            self.elmo_options_file,
            self.elmo_weight_file,
            use_character_inputs=False,
            embedding_weight_file=self.token_embedding_file)

        context_emb_op = bilm(context_ids_ph)
        utterances_emb_op = bilm(utterances_ids_ph)
        context_sentence_emb_op = bilm(context_sentence_ids_ph)

        elmo_context_input = weight_layers('input',
                                           context_emb_op,
                                           l2_coef=0.0)
        with tf.variable_scope('', reuse=True):
            elmo_utterances_input = weight_layers('input',
                                                  utterances_emb_op,
                                                  l2_coef=0.0)
            elmo_context_sentence_input = weight_layers(
                'input', context_sentence_emb_op, l2_coef=0.0)

        return (elmo_context_input, elmo_utterances_input,
                elmo_context_sentence_input)
Example #3
    def __init__(self):
        self.vocab_file = 'vocab_small.txt'
        # Location of the pretrained LM.
        datadir = os.path.join('pretrained')
        options_file = os.path.join(
            datadir, 'elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json')
        weight_file = os.path.join(
            datadir, 'elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5')

        # Dump the token embeddings to a file. Run this once for your dataset.
        token_embedding_file = 'elmo_token_embeddings.hdf5'
        dump_token_embeddings(self.vocab_file, options_file, weight_file,
                              token_embedding_file)

        self.batcher = TokenBatcher(self.vocab_file)
        # Input placeholders to the biLM.
        self.context_token_ids = tf.placeholder('int32', shape=(None, None))
        # Build the biLM graph.
        bilm = BidirectionalLanguageModel(
            options_file,
            weight_file,
            use_character_inputs=False,
            embedding_weight_file=token_embedding_file)
        # Get ops to compute the LM embeddings.
        context_embeddings_op = bilm(self.context_token_ids)
        self.elmo_context_input = weight_layers('input',
                                                context_embeddings_op,
                                                l2_coef=0.0)
        self.elmo_context_output = weight_layers('output',
                                                 context_embeddings_op,
                                                 l2_coef=0.0)
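A hedged sketch of a companion inference method that a class with the __init__ above could expose; the method name and the example sentences are assumptions, not part of the original snippet.

    def embed(self, tokenized_sentences):
        # Hypothetical helper: map pre-tokenized sentences to token ids and
        # evaluate the input-layer ELMo op built in __init__.
        token_ids = self.batcher.batch_sentences(tokenized_sentences)
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            return sess.run(self.elmo_context_input['weighted_op'],
                            feed_dict={self.context_token_ids: token_ids})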
Example #4
    def bilm_build_graph(options_file, weight_file, context_elmo, question_elmo):
        # Build the biLM graph.
        bilm = BidirectionalLanguageModel(options_file, weight_file)

        # Get ops to compute the LM embeddings.
        context_embeddings_op = bilm(context_elmo)
        question_embeddings_op = bilm(question_elmo)

        # Get an op to compute ELMo (weighted average of the internal biLM layers)
        # Our SQuAD model includes ELMo at both the input and output layers
        # of the task GRU, so we need 4x ELMo representations for the question
        # and context at each of the input and output.
        # We use the same ELMo weights for both the question and context
        # at each of the input and output.
        elmo_context_input = weight_layers('input',
                                           context_embeddings_op,
                                           l2_coef=0.0)['weighted_op']
        with tf.variable_scope('', reuse=True):
            # the reuse=True scope reuses weights from the context for the question
            elmo_question_input = weight_layers('input',
                                                question_embeddings_op,
                                                l2_coef=0.0)['weighted_op']
        """
        elmo_context_output = weight_layers(
            'output', context_embeddings_op, l2_coef=0.0
        )['weighted_op']

        with tf.variable_scope('', reuse=True):
            # the reuse=True scope reuses weights from the context for the question
            elmo_question_output = weight_layers(
                'output', question_embeddings_op, l2_coef=0.0
            )

        """
        return elmo_context_input, elmo_question_input
Example #5
 def elmo_input_embedding(self, tag):
     que1_embeddings_op = self.bilm(self.que1)
     que2_embeddings_op = self.bilm(self.que2)
     elmo_que1 = weight_layers(tag, que1_embeddings_op,
                               l2_coef=0.)['weighted_op']
     with tf.variable_scope('', reuse=True):
         elmo_que2 = weight_layers(tag, que2_embeddings_op,
                                   l2_coef=0.)['weighted_op']
     return elmo_que1, elmo_que2
Example #6
    def add_embeddings_op(self):
        """Defines self.word_embeddings"""
        b_size = tf.shape(self.cand_entities_ids)[0]
        cand_spans = tf.shape(self.cand_entities_ids)[1]
        cand_ents = tf.shape(self.cand_entities_ids)[2]
        entities = tf.reshape(self.cand_entities_ids, [b_size, cand_spans, cand_ents // 22, 22])
        entities = tf.reshape(entities, [-1, 22])
        zeros_count = tf.reduce_sum(tf.cast(tf.equal(entities, 0), tf.int32), axis=1)
        lengths = tf.math.maximum(0, 20 - zeros_count)

        with tf.variable_scope('bilm_1'):
            entitites_embeddings_op = self.entity_bilm(entities) # [batch_size, max_token]
        with tf.variable_scope('bilm_2'):
            words_embeddings_op = self.bilm(self.words)

        with tf.variable_scope("words"):
            self.word_embeddings = weight_layers('words', words_embeddings_op, l2_coef=0.0)['weighted_op']
            print("word_embeddings (after lookup) ", self.word_embeddings)

        with tf.variable_scope("entities"):
            from preprocessing.util import load_wikiid2nnid
            self.nentities = len(load_wikiid2nnid(extension_name=self.args.entity_extension))
            self.entity_embeddings = tf.reshape(
                weight_layers('entities', entitites_embeddings_op, l2_coef=0.0)['weighted_op'],
                [b_size, cand_spans, cand_ents // 22, 20, -1])  # [batch_size, max_token, vdim]

            #cell_fw = tf.contrib.rnn.LSTMCell(self.args.hidden_size_lstm // 2)
            #cell_bw = tf.contrib.rnn.LSTMCell(self.args.hidden_size_lstm // 2)
            #(output_fw, output_bw), _ = tf.nn.bidirectional_dynamic_rnn(
            #        cell_fw, cell_bw, output,
            #        sequence_length=lengths, dtype=tf.float32)
            #output = tf.concat([output_fw, output_bw], axis=-1)
            #output = tf.concat([output[:, 0, :], output[:, -1, :]], axis=-1)

            # coeffs = tf.nn.softmax(tf.squeeze(tf.layers.dense(output, 1)))
            # output = tf.reduce_sum(output * coeffs[..., None], 1)
            # self.entity_embeddings = tf.layers.dense(tf.reshape(output, [b_size, cand_spans, cand_ents // 22, 256]), 300)

            #mask = tf.math.logical_not(tf.equal(entities, 0)[:, 1:-1])
            #Q = tf.layers.dense(output, self.args.hidden_size_lstm)  # [batch_size, sequence_length, hidden_dim]
            #K = tf.layers.dense(output, self.args.hidden_size_lstm)  # [batch_size, sequence_length, hidden_dim]
            #V = tf.layers.dense(output, 300)  # [batch_size, sequence_length, n_classes]
            #query_value_attention_seq = tf.keras.layers.Attention()([Q, V, K], [mask, mask])
            #query_value_attention = tf.keras.layers.GlobalAveragePooling1D()(query_value_attention_seq)
            #self.entity_embeddings = tf.reshape(query_value_attention, [b_size, cand_spans, cand_ents // 22, -1])


            # self.entity_embeddings = util.ffnn(self.entity_embeddings, 1, 300, 300, dropout=None)
            self.pure_entity_embeddings = self.entity_embeddings
            if self.args.ent_vecs_regularization.startswith("l2"):  # 'l2' or 'l2dropout'
                self.entity_embeddings = tf.nn.l2_normalize(self.entity_embeddings, dim=3)
                # not strictly necessary, since normalization is also done when the entity embeddings are created; kept for safety
            if self.args.ent_vecs_regularization == "dropout" or \
                            self.args.ent_vecs_regularization == "l2dropout":
                self.entity_embeddings = tf.nn.dropout(self.entity_embeddings, self.dropout)
Example #7
    def _embed_ids(self):
        print('[launch] embed_ids, use_ELMO')
        with tf.name_scope('text_embedding_layer'):

            # Build the biLM graph.
            if self.params.USE_CHAR_ELMO:
                bilm = BidirectionalLanguageModel(
                    options_file=self.data_path + self.params.ELMO_OPTIONS,
                    weight_file=self.data_path + self.params.ELMO_WEIGHTS,
                    max_batch_size=self.params.batch_size *
                    self.params.MAX_SENTENCES)
            else:
                bilm = BidirectionalLanguageModel(
                    options_file=self.data_path + self.params.ELMO_OPTIONS,
                    weight_file=self.data_path + self.params.ELMO_WEIGHTS,
                    use_character_inputs=False,
                    embedding_weight_file=self.data_path +
                    self.params.ELMO_TOKEN,
                    max_batch_size=self.params.batch_size *
                    self.params.MAX_SENTENCES)

            # question
            self.embed_q_op = bilm(self.batch_q)
            self.elmo_q_output = weight_layers('output',
                                               self.embed_q_op,
                                               l2_coef=0.0)
            self.embed_q_inter = self.elmo_q_output['weighted_op']
            '''
            self.q_len_to_pad = self.params.MAX_LENGTH_Q - tf.reduce_max( self.batch_len_q ) -1
            self.q_len_to_pad = tf.maximum(self.q_len_to_pad, 0)
            self.embed_q = tf.pad( self.embed_q_inter, [[0,0], [0, self.q_len_to_pad], [0,0]] )
            '''
            self.embed_q = self.embed_q_inter

            # sentence
            self.embed_s_op = bilm(self.batch_s)
            with tf.variable_scope('', reuse=tf.AUTO_REUSE):
                self.elmo_s_output = weight_layers('output',
                                                   self.embed_s_op,
                                                   l2_coef=0.0)
            self.embed_s_inter = self.elmo_s_output['weighted_op']

            self.s_len_to_pad = self.params.MAX_SENTENCES - tf.reduce_max(
                self.batch_len_s) - 1
            self.s_len_to_pad = tf.maximum(self.s_len_to_pad, 0)
            #self.embed_s = tf.pad( self.embed_s_inter, [[0,0], [0, self.s_len_to_pad], [0,0]] )

            # [batch_size, max_len (data dependent), elmo_embedding]
            self.embed_q = self.embed_q_inter

            # [batch_size, MAX_SENTENCES, max_len (data dependent), elmo_embedding]
            self.embed_s = tf.reshape(self.embed_s_inter, [
                self.params.batch_size, self.params.MAX_SENTENCES, -1,
                self.params.DIM_WORD_EMBEDDING
            ])
Example #8
def load_elmo_embeddings(directory, top=False):
    """
    :param directory: directory with an ELMo model ('model.hdf5', 'options.json' and 'vocab.txt.gz')
    :param top: use only the top ELMo layer
    :return: ELMo batcher, character id placeholders, op object
    """
    vocab_file = os.path.join(directory, 'vocab.txt.gz')
    options_file = os.path.join(directory, 'options.json')
    weight_file = os.path.join(directory, 'model.hdf5')

    # Create a Batcher to map text to character ids.
    batcher = Batcher(vocab_file, 50)

    # Input placeholders to the biLM.
    sentence_character_ids = tf.placeholder('int32', shape=(None, None, 50))

    # Build the biLM graph.
    bilm = BidirectionalLanguageModel(options_file, weight_file, max_batch_size=300)

    # Get ops to compute the LM embeddings.
    sentence_embeddings_op = bilm(sentence_character_ids)

    # Get an op to compute ELMo (weighted average of the internal biLM layers)
    elmo_sentence_input = weight_layers('input', sentence_embeddings_op, use_top_only=top)
    return batcher, sentence_character_ids, elmo_sentence_input
Example #9
def load_elmo_embeddings(directory, top=False):
    """
    :param directory: directory with an ELMo model ('model.hdf5', 'options.json' and 'vocab.txt.gz')
    :param top: use only the top ELMo layer
    :return: ELMo batcher, character id placeholders, op object
    """
    if os.path.isfile(os.path.join(directory, 'vocab.txt.gz')):
        vocab_file = os.path.join(directory, 'vocab.txt.gz')
    elif os.path.isfile(os.path.join(directory, 'vocab.txt')):
        vocab_file = os.path.join(directory, 'vocab.txt')
    else:
        raise SystemExit('Error: no vocabulary file found in the directory.')
    options_file = os.path.join(directory, 'options.json')
    weight_file = os.path.join(directory, 'model.hdf5')
    with open(options_file, 'r') as f:
        m_options = json.load(f)
    max_chars = m_options['char_cnn']['max_characters_per_token']

    # Create a Batcher to map text to character ids.
    batcher = Batcher(vocab_file, max_chars)

    # Input placeholders to the biLM.
    sentence_character_ids = tf.compat.v1.placeholder('int32', shape=(None, None, max_chars))

    # Build the biLM graph.
    bilm = BidirectionalLanguageModel(options_file, weight_file, max_batch_size=128)

    # Get ops to compute the LM embeddings.
    sentence_embeddings_op = bilm(sentence_character_ids)

    # Get an op to compute ELMo (weighted average of the internal biLM layers)
    elmo_sentence_input = weight_layers('input', sentence_embeddings_op, use_top_only=top)
    return batcher, sentence_character_ids, elmo_sentence_input
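The function above only builds the graph. A minimal sketch of how its three return values are typically used together at inference time; the model directory and sentences below are placeholders.

batcher, character_ids, elmo_sentence_input = load_elmo_embeddings('path/to/elmo_model')

# Sentences must already be tokenized.
sentences = [['This', 'is', 'a', 'sentence', '.'],
             ['Another', 'one', '.']]
char_ids = batcher.batch_sentences(sentences)

with tf.compat.v1.Session() as sess:
    # Initialize the scalar mixture weights once before inference.
    sess.run(tf.compat.v1.global_variables_initializer())
    vectors = sess.run(elmo_sentence_input['weighted_op'],
                       feed_dict={character_ids: char_ids})
# vectors has shape (n_sentences, max_sentence_length, elmo_dim).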
Example #10
 def embed_sent_batch(self, sentences, length):
     sentences_tokenid = self._token_batcher.batch_sentences(sentences)
     # s_tokenid = s_tokenid[1:][:-1]
     tf.reset_default_graph()
     processed_sentences_tokenid = []
     length += 2 # Take into account <s> and </s>
     for s_tokenid in sentences_tokenid:
         if (len(s_tokenid) >= length):
             s_tokenid = s_tokenid[:length]
         else:
             s_tokenid = np.pad(s_tokenid, (0, length - len(s_tokenid)), 'constant', constant_values=(0))
         #s_tokenid = np.expand_dims(s_tokenid, axis=0)
         processed_sentences_tokenid.append(s_tokenid)
     batch_size = len(processed_sentences_tokenid)
     processed_sentences_tokenid = np.array(processed_sentences_tokenid)
     # tf
     with tf.device("/cpu:0"):
         context_token_ids = tf.placeholder('int32', shape=(batch_size, length))
         context_embeddings_op = self._bilm(context_token_ids)
         elmo_context_output = weight_layers('output', context_embeddings_op, l2_coef=0.0)['weighted_op']
         config = tf.ConfigProto()
         config.gpu_options.allow_growth = True
         print ('++++++Check_point_1\n')
         with tf.Session(config=config) as sess:
             sess.run([tf.global_variables_initializer()])
             elmo_context_output_ = sess.run([elmo_context_output],feed_dict={context_token_ids: processed_sentences_tokenid})[0]
     #print (elmo_context_output_.shape)
     return elmo_context_output_
Example #11
    def __init__(self, session, bilm_params):
        self.params = bilm_params

        # Create a Batcher to map text to character ids.
        self.batcher = Batcher(self.params.vocab_file,
                               self.params.max_char_len)

        # Input placeholders to the biLM.
        self.sentence_character_ids = tf.placeholder(
            'int32', shape=(None, None, self.params.max_char_len))

        # Build the biLM graph.
        bilm = BidirectionalLanguageModel(
            self.params.options_file,
            self.params.weights_file,
        )

        # Get ops to compute the LM embeddings.
        sentence_embeddings_op = bilm(self.sentence_character_ids)

        self.elmo_sentence_input = weight_layers('input',
                                                 sentence_embeddings_op,
                                                 l2_coef=0.0,
                                                 use_top_only=True)

        self.sess = session
        self.sess.run(tf.global_variables_initializer())
Example #12
def load_elmo_embeddings(directory, top=True):
    if os.path.isfile(os.path.join(directory, 'vocab.txt.gz')):
        vocab_file = os.path.join(directory, 'vocab.txt.gz')
    elif os.path.isfile(os.path.join(directory, 'vocab.txt')):
        vocab_file = os.path.join(directory, 'vocab.txt')
    else:
        raise SystemExit('Error: no vocabulary file found in the directory.')
    options_file = os.path.join(directory, 'options.json')
    weight_file = os.path.join(directory, 'model.hdf5')

    # Create a Batcher to map text to character ids.
    batcher = Batcher(vocab_file, 50)

    # Input placeholders to the biLM.
    sentence_character_ids = tf.placeholder('int32', shape=(None, None, 50))

    # Build the biLM graph.
    bilm = BidirectionalLanguageModel(options_file,
                                      weight_file,
                                      max_batch_size=300)

    # Get ops to compute the LM embeddings.
    sentence_embeddings_op = bilm(sentence_character_ids)

    # Get an op to compute ELMo (weighted average of the internal biLM layers)
    # Our model includes ELMo at both the input and output layers
    # of the task GRU, so we need 2x ELMo representations at each of the input and output.

    elmo_sentence_input = weight_layers('input',
                                        sentence_embeddings_op,
                                        use_top_only=top)
    return batcher, sentence_character_ids, elmo_sentence_input
Example #13
    def call(self, x, mask=None):
        context_embeddings_op = self.bilm(x)
        elmo_context_input = weight_layers('input',
                                           context_embeddings_op,
                                           l2_coef=0.0)
        elmo = elmo_context_input['weighted_op']

        return elmo
Example #14
    def add_elmo_embedding_layer(self,
                                 options_file,
                                 weight_file,
                                 output_use=False):
        """
        Adds ELMo lstm embeddings to the graph.
        1. self.elmo_context_input (batch size, max_context_len among the batch, 1024)
        2. self.elmo_question_input (batch size, max_qn_len among the batch, 1024)
        If output_use is true:
            add the output to the graph either

        Inputs:
            options_file: json_file for the pretrained model
            weight_file: weights hdf5 file for the pretrained model
            output_use: determine if use elmo in output of biRNN (default False)

        """
        #Build biLM graph
        bilm = BidirectionalLanguageModel(options_file, weight_file)
        context_embeddings_op = bilm(self.context_elmo)
        question_embeddings_op = bilm(self.qn_elmo)

        # Get an op to compute ELMo (weighted average of the internal biLM layers)
        # Our SQuAD model includes ELMo at both the input and output layers
        # of the task GRU, so we need 4x ELMo representations for the question
        # and context at each of the input and output.
        # We use the same ELMo weights for both the question and context
        # at each of the input and output.
        #compute the final ELMo representations.
        self.elmo_context_input = weight_layers(
            'input', context_embeddings_op, l2_coef=0.001
        )['weighted_op']  #(batch size, max_context_len among the batch, 1024)
        with tf.variable_scope('', reuse=True):
            # the reuse=True scope reuses weights from the context for the question
            self.elmo_question_input = weight_layers(
                'input', question_embeddings_op, l2_coef=0.001)['weighted_op']

        if output_use:
            self.elmo_context_output = weight_layers(
                'output', context_embeddings_op, l2_coef=0.001)['weighted_op']
            with tf.variable_scope('', reuse=True):
                # the reuse=True scope reuses weights from the context for the question
                self.elmo_question_output = weight_layers(
                    'output', question_embeddings_op,
                    l2_coef=0.001)['weighted_op']
Example #15
 def __elmo_embedding(self, inputs, masks, keep_prob=0.8):
     """Compute ELMo embeddings.
     """
     from bilm import weight_layers
     elmo_embeddings_op = self.elmo_bilm(inputs)
     elmo_input = weight_layers('input', elmo_embeddings_op, l2_coef=0.0)
     elmo_embeddings = elmo_input['weighted_op'] # (batch_size, sentence_length, elmo_dim)
     # masking(remove noise due to padding)
     elmo_embeddings *= masks
     return tf.nn.dropout(elmo_embeddings, keep_prob)
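The masks tensor is supplied by the caller; one assumed way to build it (not shown in the original snippet) is a 0/1 padding mask expanded so it broadcasts over the ELMo dimension.

# Assumed construction of masks; sentence_lengths and max_sentence_length are
# placeholders for whatever the surrounding model already tracks.
masks = tf.expand_dims(
    tf.sequence_mask(sentence_lengths, max_sentence_length, dtype=tf.float32),
    axis=-1)  # (batch_size, sentence_length, 1)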
Example #16
    def word_embedding(self):
        bilm = BidirectionalLanguageModel(
            self.options_file,
            self.weight_file,
            use_character_inputs=False,
            embedding_weight_file=self.token_embedding_file)
        context_embeddings_op = bilm(self.W_P)
        question_embeddings_op = bilm(self.W_Q)

        elmo_context_input = weight_layers('input',
                                           context_embeddings_op,
                                           l2_coef=0.0)
        with tf.variable_scope('', reuse=True):
            # the reuse=True scope reuses weights from the context for the question
            elmo_question_input = weight_layers('input',
                                                question_embeddings_op,
                                                l2_coef=0.0)
        self.p_embed = elmo_context_input['weighted_op']
        self.q_embed = elmo_question_input['weighted_op']
Example #17
def elmo_embedding(options_file, weight_file, token_a_character_ids,
                   token_b_character_ids):
    # Input placeholders to the biLM.
    # token_a_character_ids = tf.placeholder('int32', shape=(None, None, 50))
    # token_b_character_ids = tf.placeholder('int32', shape=(None, None, 50))

    # Build the biLM graph.
    bilm = BidirectionalLanguageModel(options_file, weight_file)

    # Get ops to compute the LM embeddings.
    token_a_embeddings_op = bilm(token_a_character_ids)
    token_b_embeddings_op = bilm(token_b_character_ids)

    elmo_token_a = weight_layers('input', token_a_embeddings_op, l2_coef=0.0)
    with tf.variable_scope('', reuse=True):
        # the reuse=True scope shares the ELMo scalar-mix weights between token_a and token_b
        elmo_token_b = weight_layers('input',
                                     token_b_embeddings_op,
                                     l2_coef=0.0)

    return elmo_token_a['weighted_op'], elmo_token_b['weighted_op']
Example #18
    def __init__(self, config):
        self.lr = config["lr"]
        self.input_dropout = config["dropout"]
        self.lstm_dim = config["lstm_dim"]
        self.layer_type = config["layer_type"]
        self.use_attention = config["attention"]
        self.num_attention_heads = config['num_attention_heads']
        self.size_per_head = config['size_per_head']
        self.num_tags = 7
        self.char_dim = 300
        self.global_step = tf.Variable(0, trainable=False)
        self.best_dev_f1 = tf.Variable(0.0, trainable=False)
        self.initializer = initializers.xavier_initializer()

        # elmo
        self.batcher = TokenBatcher(config['vocab_file'])
        # Input placeholders to the biLM.
        self.context_token_ids = tf.placeholder('int32', shape=(None, None))
        # Build the biLM graph.
        self.bilm = BidirectionalLanguageModel(
            config['options_file'],
            config['weight_file'],
            use_character_inputs=False,
            embedding_weight_file=config['token_embedding_file'])
        self.context_embeddings_op = self.bilm(self.context_token_ids)
        self.elmo_context_input = weight_layers('input',
                                                self.context_embeddings_op,
                                                l2_coef=0.0)['weighted_op']

        # add placeholders for the model
        self.mask_inputs = tf.placeholder(dtype=tf.int32,
                                          shape=[None, None],
                                          name="ChatInputs")
        self.targets = tf.placeholder(dtype=tf.int32,
                                      shape=[None, None],
                                      name="Targets")

        # dropout keep prob
        self.dropout = tf.placeholder(dtype=tf.float32, name="Dropout")
        used = tf.sign(tf.abs(self.mask_inputs))
        length = tf.reduce_sum(used, reduction_indices=1)
        self.lengths = tf.cast(length, tf.int32)
        self.batch_size = tf.shape(self.mask_inputs)[0]
        self.num_steps = tf.shape(self.mask_inputs)[-1]

        self.logits = self.inference(self.elmo_context_input)
        # loss of the model
        self.loss = self.loss_layer(self.logits, self.lengths)
        self.train_op = self.train(self.loss)
        # saver of the model
        self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)
Example #19
File: Elmo.py  Project: xujunrt/NER-1
    def _add_elmo_embedding(self):
        """
        The Elmo embedding layer
        """
        embeddings_op = self.elmo_bilm(self.elmo_p)
        self.elmo_emb = weight_layers('input', embeddings_op)['weighted_op']

        if self.elmo_mode == 1:
            # concat word emb and elmo emb
            self.embedding_layer = tf.concat(
                [self.embedding_layer, self.elmo_emb], 2)
        else:
            # Default: only use Elmo
            self.embedding_layer = self.elmo_emb
Example #20
    def _load_embeddings(self,
                         vocab="vocab.txt",
                         options="elmo_options.json",
                         weights="elmo_weights.hdf5"):
        self.elmo_model = BidirectionalLanguageModel(options, weights)
        self.batcher = Batcher(vocab, 50)

        self.character_ids = tf.placeholder('int32', shape=(None, None, 50))
        context_embeddings_op = self.elmo_model(self.character_ids)
        self.elmo_context_output = weight_layers('output',
                                                 context_embeddings_op,
                                                 l2_coef=0.0)

        tf.global_variables_initializer().run()
Example #21
    def __init__(self, path=embedding_path, embedding_dim=512,
                 sentence_len=max_sentence_len, pair_mode=False):
        embeddings = dict()

        self.embedding_path = path
        self.embedding_dim = embedding_dim
        self.sentence_len = sentence_len
        self.pair_mode = pair_mode
        self.embedding_dict = embeddings

        g_elmo = tf.Graph()
        vocab_file = './bilmelmo/data/vocab.txt'
        options_file = './bilmelmo/try/options.json'
        weight_file = './bilmelmo/try/weights.hdf5'
        token_embedding_file = './bilmelmo/data/vocab_embedding.hdf5'

        with g_elmo.as_default():
            self.batcher = TokenBatcher(vocab_file)
            self.context_token_ids = tf.placeholder('int32', shape=(None, None))
            self.bilm = BidirectionalLanguageModel(
                options_file,
                weight_file,
                use_character_inputs=False,
                embedding_weight_file=token_embedding_file
            )

            self.context_embeddings_op = self.bilm(self.context_token_ids)
            self.elmo_context_input = weight_layers('input', self.context_embeddings_op, l2_coef=0.0)

            self.elmo_context_output = weight_layers(
                'output', self.context_embeddings_op, l2_coef=0.0
            )
            init = tf.global_variables_initializer()
        sess_elmo = tf.Session(graph=g_elmo)
        sess_elmo.run(init)
        self.sess_elmo = sess_elmo
Example #22
 def weight_layers(self,
                   name,
                   bilm_ops,
                   l2_coef=None,
                   use_top_only=False,
                   do_layer_norm=False):
     '''
     Weight the layers of a biLM with trainable scalar weights to compute ELMo representations.
     See more details on https://github.com/allenai/bilm-tf/blob/81a4b54937f4dfb93308f709c1cf34dbb37c553e/bilm/elmo.py
     {
        'weighted_op': op to compute weighted average for output,
        'regularization_op': op to compute regularization term
     }
     '''
     return weight_layers(name, bilm_ops, l2_coef, use_top_only,
                          do_layer_norm)
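A sketch of how the returned dictionary is typically consumed when wiring ELMo into a downstream model; model, bilm_ops and task_loss are assumed names, not part of the original snippet.

elmo = model.weight_layers('input', bilm_ops, l2_coef=0.001)
elmo_repr = elmo['weighted_op']                      # (batch, time, lm_dim) weighted layer average
total_loss = task_loss + elmo['regularization_op']   # L2 penalty on the scalar mixture weights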
Example #23
 def __init__(self, config):
     super(NERModel, self).__init__(config)
     self.idx_to_tag = {
         idx: tag
         for tag, idx in list(self.config.vocab_tags.items())
     }
     if self.config.use_elmo:
         # self.elmo_inputs = []
         self.batcher = Batcher(self.config.filename_words, 50)
         self.bilm = BidirectionalLanguageModel(
             self.config.filename_elmo_options,
             self.config.filename_elmo_weights)
         self.elmo_token_ids = tf.placeholder('int32',
                                              shape=(None, None, 50))
         self.elmo_embeddings_op = self.bilm(self.elmo_token_ids)
         self.elmo_embeddings_input = weight_layers('input',
                                                    self.elmo_embeddings_op,
                                                    l2_coef=0.0)
Example #24
File: elmo.py  Project: sdadas/yast
 def __lambda_layer(x):
     import tensorflow as tf
     from utils.files import ProjectPath
     from bilm import BidirectionalLanguageModel, all_layers, weight_layers
     x_input = tf.cast(x, tf.int32)
     input_dir = ProjectPath.from_dict(path_dict)
     options_file: str = input_dir.join("options.json").get()
     weight_file: str = input_dir.join("weights.hdf5").get()
     with tf.variable_scope('', reuse=tf.AUTO_REUSE):
         bilm = BidirectionalLanguageModel(options_file, weight_file)
         embedding_op = bilm(x_input)
         if mode == "weighted":
             return all_layers(embedding_op)
         else:
             context_input = weight_layers('input',
                                           embedding_op,
                                           l2_coef=0.0,
                                           use_top_only=(mode == "top"))
             return context_input['weighted_op']
Example #25
def get_elmo_embeddings(config):

    batcher = Batcher(config.filename_words, 50)

    token_ids = tf.placeholder('int32', shape=(None, None, 50))
    bilm = BidirectionalLanguageModel(
        config.filename_elmo_options,
        config.filename_elmo_weights,
    )

    elmo_embeddings_op = bilm(token_ids)
    elmo_context_input = weight_layers('input',
                                       elmo_embeddings_op,
                                       l2_coef=0.0)

    with tf.Session() as sess:
        # It is necessary to initialize variables once before running inference.

        sess.run(tf.global_variables_initializer())

        # Create batches of data.
        train = CoNLLDataset(config.filename_train)
        sents_train = [entry[0] for entry in train]
        sent_ids_train = batcher.batch_sentences(sents_train)

        # Compute ELMo representations (here for the input only, for simplicity).

        # batch_sentences returns a 3D array (n_sentences, max_len, 50), so feed
        # one sentence at a time with an explicit batch dimension.
        elmo_input = sess.run(elmo_context_input['weighted_op'],
                              feed_dict={token_ids: sent_ids_train[0:1]})
        for batch in sent_ids_train[1:]:
            elmo_input_ = sess.run(elmo_context_input['weighted_op'],
                                   feed_dict={token_ids: batch[None, ...]})
            elmo_input = np.vstack((elmo_input, elmo_input_))

        test = CoNLLDataset(config.filename_test)
        sents_test = [entry[0] for entry in test]
        sent_ids_test = batcher.batch_sentences(sents_test)

        elmo_context_output_ = sess.run(elmo_context_input['weighted_op'],
                                        feed_dict={token_ids: sent_ids_test})

    return elmo_input, elmo_context_output_
Example #26
def make_elmo(chars_batched):
    bilm = BidirectionalLanguageModel(
                    options_file="data/elmo_2x4096_512_2048cnn_2xhighway_options.json",
                    weight_file="data/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5",
                    max_batch_size=128)

    lm = bilm(chars_batched)
    word_representations_padded = weight_layers('scalar_mix', lm, l2_coef=0.0)['weighted_op']

    # Strip off multiplication by gamma. Our parser has gamma=1 because there is a
    # projection matrix right after
    word_representations_padded = word_representations_padded.op.inputs[0]

    with tf.variable_scope('', reuse=True):
        elmo_scalar_mix_matrix = tf.get_variable('scalar_mix_ELMo_W')

    tf.global_variables_initializer().run()
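    # Note: sd below is a state dict (e.g. a loaded PyTorch checkpoint) defined
    # elsewhere in the original script; it is not part of this snippet.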
    tf.assign(elmo_scalar_mix_matrix, [
        float(sd['elmo.scalar_mix_0.scalar_parameters.0']),
        float(sd['elmo.scalar_mix_0.scalar_parameters.1']),
        float(sd['elmo.scalar_mix_0.scalar_parameters.2'])]).eval()

    # Switch from padded to packed representation
    valid_mask = lm['mask']
    dim_padded = tf.shape(lm['mask'])[:2]
    mask_flat = tf.reshape(lm['mask'], (-1,))
    dim_flat = tf.shape(mask_flat)[:1]
    nonpad_ids = tf.to_int32(tf.where(mask_flat)[:,0])
    word_reps_shape = tf.shape(word_representations_padded)
    word_representations_flat = tf.reshape(word_representations_padded, [-1, int(word_representations_padded.shape[-1])])
    word_representations = tf.gather(word_representations_flat, nonpad_ids)

    projected_annotations = tf.matmul(
        word_representations,
        tf.constant(sd['project_elmo.weight'].numpy().transpose()))

    return projected_annotations, nonpad_ids, dim_flat, dim_padded, valid_mask, lm['lengths']
Example #27
    def __init__(
        self,
        request_names=['train', 'valid', 'test'],
        new_names=['train', 'valid', 'test'],
        classes_name='classes',
        op_type='vectorizer',
        op_name='elmo',
        dimension=1024,
        file_type='bin',  #TODO: ?
        options_file='./embeddingsruwiki_pp_1.0_elmo/options.json',  #TODO: ?
        weights_file='./embeddingsruwiki_pp_1.0_elmo/weights.hdf5',  #TODO: ?
        vocab_file='./embeddingsruwiki_pp_1.0_elmo/vocab.txt'  #TODO: ?
    ):
        super().__init__(request_names, new_names, op_type, op_name)
        self.file_type = file_type
        self.classes_name = classes_name
        self.dimension = dimension
        # Location of pretrained LM.
        self.options_file = options_file
        self.weights_file = weights_file
        self.vocab_file = vocab_file
        # Create a Batcher to map text to character ids.
        char_per_token = 50
        self.batcher = Batcher(self.vocab_file, char_per_token)
        # Input placeholders to the biLM.
        self.character_ids = tf.placeholder('int32',
                                            shape=(None, None, char_per_token))
        # Build the biLM graph.
        bilm = BidirectionalLanguageModel(self.options_file, self.weights_file)

        # Get ops to compute the LM embeddings.
        embeddings_op = bilm(self.character_ids)

        # Get an op to compute ELMo (weighted average of the internal biLM layers)
        self.elmo_output = weight_layers('elmo_output',
                                         embeddings_op,
                                         l2_coef=0.0)
Example #28
    def embedding_layer(self, char_inputs, elmo_model, name=None):
        """
        :param char_inputs: one-hot encoding of sentence
        :param seg_inputs: segmentation feature
        :param config: wither use segmentation feature
        :return: [1, num_steps, embedding size],
        """
        # embedding = []
        # with tf.variable_scope("char_embedding" if not name else name), tf.device('/cpu:0'):
        #     self.char_lookup = tf.get_variable(
        #             name="char_embedding",
        #             shape=[self.num_chars, self.char_dim],
        #             initializer=self.initializer)
        #     embedding.append(tf.nn.embedding_lookup(self.char_lookup, char_inputs))
        #     embed = tf.concat(embedding, axis=-1)

        # load bert embedding

        ops = self.elmo(self.ids)

        elmo_context_input = weight_layers('input', ops, l2_coef=0.0)
        elmo_embedding = elmo_context_input['weighted_op']

        return elmo_embedding
Example #29
context_character_ids = tf.placeholder('int32', shape=(None, None, 50))
question_character_ids = tf.placeholder('int32', shape=(None, None, 50))

# Build the biLM graph.
bilm = BidirectionalLanguageModel(options_file, weight_file)

# Get ops to compute the LM embeddings.
context_embeddings_op = bilm(context_character_ids)
question_embeddings_op = bilm(question_character_ids)

# Get an op to compute ELMo (weighted average of the internal biLM layers)
# Our SQuAD model includes ELMo at both the input and output layers
# of the task GRU, so we need 4x ELMo representations for the question
# and context at each of the input and output.
# We use the same ELMo weights for both the question and context
# at each of the input and output.
elmo_context_input = weight_layers('input', context_embeddings_op, l2_coef=0.0)
with tf.variable_scope('', reuse=True):
    # the reuse=True scope reuses weights from the context for the question
    elmo_question_input = weight_layers(
        'input', question_embeddings_op, l2_coef=0.0
    )

elmo_context_output = weight_layers(
    'output', context_embeddings_op, l2_coef=0.0
)
with tf.variable_scope('', reuse=True):
    # the reuse=True scope reuses weights from the context for the question
    elmo_question_output = weight_layers(
        'output', question_embeddings_op, l2_coef=0.0
    )
Example #30
# We will use "${args.exptdir}/alltrain.epitope.elmo" as the model directory
model_dir = join(args.exptdir, 'alltrain.epitope.elmo', 'best_model')
vocab_file = join(args.exptdir, 'alltrain.epitope.vocab')
options_file = join(model_dir, 'pred.options.json')
weight_file = join(model_dir, 'weights.h5')

# Create a Batcher to map text to character ids.
batcher = Batcher(vocab_file, 50)

# Input placeholders to the biLM.
context_character_ids = tf.placeholder('int32', shape=(None, None, 50))
bilm = BidirectionalLanguageModel(options_file, weight_file)

context_embeddings_op = bilm(context_character_ids)

elmo_context_input = weight_layers('input', context_embeddings_op, l2_coef=0.0)
elmo_context_output = weight_layers('output',
                                    context_embeddings_op,
                                    l2_coef=0.0)

with tf.Session() as sess:
    # It is necessary to initialize variables once before running inference.

    sess.run(tf.global_variables_initializer())

    for trial in range(trial_num):
        t_topdir = join(
            args.datadir, 'trial' +
            str(trial + 1)) if args.trial_num >= 1 else args.datadir

        for dtype in dtypes:
Example #31
#input placeholder to the biLM
token_ids = tf.placeholder('int32', shape=(None, None))
y_label = tf.placeholder('float32', shape=(None, None, 17))

#Build the biLM graph
bilm = BidirectionalLanguageModel(options_file,
                                  weight_file,
                                  use_character_inputs=False,
                                  embedding_weight_file=token_embedding_file)

#Get ops to compute the LM embeddings
embeddings_op = bilm(token_ids)

#Get an op to compute ELMo(weighted average of the internal biLM layers)
elmo_input = weight_layers('input', embeddings_op, l2_coef=0.0)

hidden_dim = 512
dropout = 0.5
#Bidirectional layers
fw_cell = tf.contrib.rnn.BasicLSTMCell(hidden_dim, state_is_tuple=True)
bw_cell = tf.contrib.rnn.BasicLSTMCell(hidden_dim, state_is_tuple=True)
fw_cell = tf.contrib.rnn.DropoutWrapper(fw_cell, 1 - dropout)
bw_cell = tf.contrib.rnn.DropoutWrapper(bw_cell, 1 - dropout)

##shape(batch_num, length, hs_dim)
(outputs, (fw_st,
           bw_st)) = tf.nn.bidirectional_dynamic_rnn(fw_cell,
                                                     bw_cell,
                                                     elmo_input['weighted_op'],
                                                     dtype=tf.float32,