def __init__(self, session, bilm_params):
        self.params = bilm_params

        # Create a Batcher to map text to character ids.
        self.batcher = Batcher(self.params.vocab_file,
                               self.params.max_char_len)

        # Input placeholders to the biLM.
        self.sentence_character_ids = tf.placeholder(
            'int32', shape=(None, None, self.params.max_char_len))

        # Build the biLM graph.
        bilm = BidirectionalLanguageModel(
            self.params.options_file,
            self.params.weights_file,
        )

        # Get ops to compute the LM embeddings.
        sentence_embeddings_op = bilm(self.sentence_character_ids)

        self.elmo_sentence_input = weight_layers('input',
                                                 sentence_embeddings_op,
                                                 l2_coef=0.0,
                                                 use_top_only=True)

        self.sess = session
        self.sess.run(tf.global_variables_initializer())
    def list_to_token_embeddings(self, outfile_to_dump=None):
        '''
        Given an input vocabulary file, dump all the token embeddings to the
        outfile.  The result can be used as the embedding_weight_file when
        constructing a BidirectionalLanguageModel.
        '''

        #batcher = TokenBatcher(vocab_file)
        vocab = UnicodeCharsVocabulary(self.voc_file_path,
                                       self.max_word_length)
        batcher = Batcher(self.voc_file_path, self.max_word_length)
        embedding_op = self.ops['token_embeddings']
        n_tokens = vocab.size
        embed_dim = int(embedding_op.shape[2])
        embeddings = np.zeros((n_tokens, embed_dim), dtype=DTYPE)

        config = tf.ConfigProto(allow_soft_placement=True)
        with tf.Session(config=config) as sess:
            sess.run(tf.global_variables_initializer())
            for k in tqdm(range(n_tokens)):
                token = vocab.id_to_word(k)
                char_ids = batcher.batch_sentences([[token]
                                                    ])[0,
                                                       1, :].reshape(1, 1, -1)
                embeddings[k, :] = sess.run(
                    embedding_op, feed_dict={self.ids_placeholder: char_ids})

        with h5py.File(outfile_to_dump, 'w') as fout:
            ds = fout.create_dataset('embedding',
                                     embeddings.shape,
                                     dtype='float32',
                                     data=embeddings)

        return embeddings, vocab._word_to_id
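
Per the docstring above, the dumped file can later be supplied as an embedding_weight_file. A minimal sketch of that consumption path, assuming the bilm token-input API and with all file names as placeholder assumptions:

from bilm import TokenBatcher, BidirectionalLanguageModel, weight_layers
import tensorflow as tf

# Token-level batcher: maps sentences to token ids instead of character ids.
token_batcher = TokenBatcher('vocab.txt')

# Token-id placeholder is rank 2 (batch, sequence) rather than rank 3.
token_ids_placeholder = tf.placeholder('int32', shape=(None, None))

# Reuse the dumped token embeddings instead of running the character CNN.
bilm = BidirectionalLanguageModel('options.json',
                                  'weights.hdf5',
                                  use_character_inputs=False,
                                  embedding_weight_file='token_embeddings.hdf5')

embeddings_op = bilm(token_ids_placeholder)
elmo_input = weight_layers('input', embeddings_op, l2_coef=0.0)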
Example #3
 def __init__(self, train_corpus_fname, test_corpus_fname,
              vocab_fname, options_fname, pretrain_model_fname,
              model_save_path, max_characters_per_token=30,
              batch_size=32, num_labels=2):
     # Load a corpus.
     super().__init__(train_corpus_fname=train_corpus_fname,
                      tokenized_train_corpus_fname=train_corpus_fname + ".elmo-tokenized",
                      test_corpus_fname=test_corpus_fname,
                      tokenized_test_corpus_fname=test_corpus_fname + ".elmo-tokenized",
                      model_name="elmo", vocab_fname=vocab_fname,
                      model_save_path=model_save_path, batch_size=batch_size)
     # configurations
     self.options_fname = options_fname
     self.pretrain_model_fname = pretrain_model_fname
     self.max_characters_per_token = max_characters_per_token
     self.num_labels = 2 # positive, negative
     self.num_train_steps = (int((len(self.train_data) - 1) / self.batch_size) + 1) * self.num_epochs
     self.eval_every = int(self.num_train_steps / self.num_epochs)  # evaluate once per epoch
     # Create a Batcher to map text to character ids.
     # lm_vocab_file = ELMo can build input ids on the fly even without a token vocab,
     # but pre-building ids for frequent character sequences (i.e. the vocab) speeds up training
     # max_token_length = the maximum number of characters in each token
     self.batcher = Batcher(lm_vocab_file=vocab_fname, max_token_length=self.max_characters_per_token)
     self.training = tf.placeholder(tf.bool)
     # build train graph
     self.ids_placeholder, self.labels_placeholder, self.dropout_keep_prob, self.logits, self.loss = make_elmo_graph(options_fname,
                                                                                                                     pretrain_model_fname,
                                                                                                                     max_characters_per_token,
                                                                                                                     num_labels, tune=True)
Example #4
    def __init__(
            self,
            tune_model_fname="/notebooks/embedding/data/sentence-embeddings/elmo/tune-ckpt",
            pretrain_model_fname="/notebooks/embedding/data/sentence-embeddings/elmo/pretrain-ckpt/elmo.model",
            options_fname="/notebooks/embedding/data/sentence-embeddings/elmo/pretrain-ckpt/options.json",
            vocab_fname="/notebooks/embedding/data/sentence-embeddings/elmo/pretrain-ckpt/elmo-vocab.txt",
            max_characters_per_token=30,
            dimension=256,
            num_labels=2,
            use_notebook=False):

        # configurations
        super().__init__("elmo", dimension, use_notebook)
        self.tokenizer = get_tokenizer("mecab")
        self.batcher = Batcher(lm_vocab_file=vocab_fname,
                               max_token_length=max_characters_per_token)
        self.ids_placeholder, self.elmo_embeddings, self.probs = make_elmo_graph(
            options_fname,
            pretrain_model_fname,
            max_characters_per_token,
            num_labels,
            tune=False)
        # restore model
        saver = tf.train.Saver(tf.global_variables())
        self.sess = tf.Session()
        checkpoint_path = tf.train.latest_checkpoint(tune_model_fname)
        saver.restore(self.sess, checkpoint_path)
Example #5
    def __init__(self, FLAGS, id2word, word2id, emb_matrix, id2char, char2id):
        self.FLAGS = FLAGS
        self.id2word = id2word
        self.word2id = word2id
        self.emb_matrix = emb_matrix
        self.id2char = id2char
        self.char2id = char2id

        self.batcher = Batcher(
            "/Users/lam/Desktop/Lam-cs224n/Projects/qa/squad/data/elmo/elmo_vocab.txt",
            50)
        self.filters = [(5, 10)]  #change back to 100 after

        self.options_file = "/Users/lam/Desktop/Lam-cs224n/Projects/qa/squad/data/elmo/elmo.json"
        self.weight_file = "/Users/lam/Desktop/Lam-cs224n/Projects/qa/squad/data/elmo/lm_weight.hdf5"

        with tf.variable_scope(
                "QAModel",
                initializer=tf.contrib.layers.variance_scaling_initializer(
                    factor=1.0, uniform=True)):
            self.add_placeholders()
            self.add_embedding_layer(emb_matrix)
        self.add_elmo_embedding_layer(self.options_file, self.weight_file)
        with tf.variable_scope(
                "QAModel",
                initializer=tf.contrib.layers.variance_scaling_initializer(
                    factor=1.0, uniform=True)):
            self.build_graph()
            self.add_loss()

        # Define trainable parameters, gradient, gradient norm, and clip by gradient norm
        params = tf.trainable_variables(
            "QAModel")  #since only one scope "QAModel"
        gradients = tf.gradients(
            self.loss,
            params)  # d(loss)/d(params) return list of (length len(params))
        self.gradient_norm = tf.global_norm(gradients)
        clipped_gradients, _ = tf.clip_by_global_norm(
            gradients,
            5.0)  #return list_clipped, global_norm(here we don't need this)
        self.param_norm = tf.global_norm(params)

        # Define optimizer and updates
        # (updates is what you need to fetch in session.run to do a gradient update)
        self.global_step = tf.Variable(0, name="global_step", trainable=False)
        #This will increment the global step if global_step is not None
        opt = tf.train.AdamOptimizer(
            learning_rate=0.001)  # you can try other optimizers
        self.updates = opt.apply_gradients(zip(clipped_gradients, params),
                                           global_step=self.global_step)

        self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=1)
        self.bestmodel_saver = tf.train.Saver(tf.global_variables(),
                                              max_to_keep=1)

        self.summaries = tf.summary.merge_all()
Example #6
    def __init__(self, model_path):
        vocab_file = os.path.join(model_path, 'vocabs.txt')
        options_file = os.path.join(model_path, 'options.json')
        weight_file = os.path.join(model_path, 'weights.hdf5')
        with open(options_file, "r") as fj:
            options = json.load(fj)
        self.max_characters_per_token = options['char_cnn']['max_characters_per_token']        

        # Create a Batcher to map text to character ids.
        self.batcher = Batcher(vocab_file, self.max_characters_per_token)
        # Build the biLM graph.
        self.bilm = BidirectionalLanguageModel(options_file, weight_file)
    def __init__(self, spec: str, vocab_file='./datar/vocab/vocab.txt', max_word_length=50,
                 elmo_output_names: Optional[List] = None,
                 dim: Optional[int] = None, pad_zero: bool = False,
                 concat_last_axis: bool = True, max_token: Optional[int] = None,
                 mini_batch_size: int = 32, **kwargs) -> None:

        self.spec = spec if '://' in spec else str(expand_path(spec))
        self.max_word_length = max_word_length
        self.vocab_file = vocab_file 
        self.batcher = Batcher(self.vocab_file, self.max_word_length)
        self.pad_zero = pad_zero
        self.concat_last_axis = concat_last_axis
        self.max_token = max_token
        self.mini_batch_size = mini_batch_size
        self.elmo_outputs, self.sess, self.ids_placeholder = self._load()
Example #8
    def __init__(self, params):

        self.data_path = params.data_path
        self.params = params

        if params.IS_DEBUG:
            print('debug mode')
            # load data for debugging
            self.train = self.load_data(self.data_path +
                                        self.params.DATA_DEBUG)
            self.dev = self.load_data(self.data_path + self.params.DATA_DEBUG)
            self.test = self.load_data(self.data_path + self.params.DATA_DEBUG)

        else:
            # load data
            self.train = self.load_data(self.data_path +
                                        self.params.DATA_TRAIN)
            self.dev = self.load_data(self.data_path + self.params.DATA_DEV)
            self.test = self.load_data(self.data_path + self.params.DATA_TEST)

        # batcher for ELMo
        if self.params.USE_CHAR_ELMO:
            print('[INFO] character-level ELMo')
            self.batcher = Batcher(self.data_path + self.params.DIC, 50)
        else:
            print('[INFO] cached-token-level ELMo')
            self.batcher = TokenBatcher(self.data_path + self.params.DIC)

        self.dic_size = 0
        with open(self.data_path + self.params.DIC, 'r') as f:
            self.dic = f.readlines()
            self.dic = [x.strip() for x in self.dic]
            self.dic_size = len(self.dic)

        print('[completed] load data, dic_size: ', self.dic_size)
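
A small illustration of what the two batchers chosen above return (the vocabulary path is a placeholder assumption): Batcher yields character ids, TokenBatcher yields token ids.

from bilm import Batcher, TokenBatcher

sentences = [['Pretrained', 'biLMs', 'are', 'useful'],
             ['So', 'are', 'cached', 'token', 'embeddings']]

# Character-level ids: shape (batch, max_sentence_len + 2, 50); +2 for <S>, </S>.
char_ids = Batcher('vocab.txt', 50).batch_sentences(sentences)

# Token-level ids: shape (batch, max_sentence_len + 2).
token_ids = TokenBatcher('vocab.txt').batch_sentences(sentences)

print(char_ids.shape, token_ids.shape)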
Example #9
def load_elmo_embeddings(directory, top=False):
    """
    :param directory: directory with an ELMo model ('model.hdf5', 'options.json' and 'vocab.txt.gz')
    :param top: use only the top ELMo layer
    :return: ELMo batcher, character id placeholders, op object
    """
    if os.path.isfile(os.path.join(directory, 'vocab.txt.gz')):
        vocab_file = os.path.join(directory, 'vocab.txt.gz')
    elif os.path.isfile(os.path.join(directory, 'vocab.txt')):
        vocab_file = os.path.join(directory, 'vocab.txt')
    else:
        raise SystemExit('Error: no vocabulary file found in the directory.')
    options_file = os.path.join(directory, 'options.json')
    weight_file = os.path.join(directory, 'model.hdf5')
    with open(options_file, 'r') as f:
        m_options = json.load(f)
    max_chars = m_options['char_cnn']['max_characters_per_token']

    # Create a Batcher to map text to character ids.
    batcher = Batcher(vocab_file, max_chars)

    # Input placeholders to the biLM.
    sentence_character_ids = tf.compat.v1.placeholder('int32', shape=(None, None, max_chars))

    # Build the biLM graph.
    bilm = BidirectionalLanguageModel(options_file, weight_file, max_batch_size=128)

    # Get ops to compute the LM embeddings.
    sentence_embeddings_op = bilm(sentence_character_ids)

    # Get an op to compute ELMo (weighted average of the internal biLM layers)
    elmo_sentence_input = weight_layers('input', sentence_embeddings_op, use_top_only=top)
    return batcher, sentence_character_ids, elmo_sentence_input
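
A usage sketch for the helper above (the model directory path is an assumption):

batcher, sentence_character_ids, elmo_sentence_input = load_elmo_embeddings('elmo_model_dir')

with tf.compat.v1.Session() as sess:
    # Variables must be initialized once before running inference.
    sess.run(tf.compat.v1.global_variables_initializer())

    sentences = [['Hello', 'world'], ['Another', 'tokenized', 'sentence']]
    char_ids = batcher.batch_sentences(sentences)

    # Shape: (batch_size, max_sentence_len, lm_dim).
    elmo_vectors = sess.run(elmo_sentence_input['weighted_op'],
                            feed_dict={sentence_character_ids: char_ids})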
Example #10
def load_elmo_embeddings(directory, top=True):
    if os.path.isfile(os.path.join(directory, 'vocab.txt.gz')):
        vocab_file = os.path.join(directory, 'vocab.txt.gz')
    elif os.path.isfile(os.path.join(directory, 'vocab.txt')):
        vocab_file = os.path.join(directory, 'vocab.txt')
    else:
        raise SystemExit('Error: no vocabulary file found in the directory.')
    options_file = os.path.join(directory, 'options.json')
    weight_file = os.path.join(directory, 'model.hdf5')

    # Create a Batcher to map text to character ids.
    batcher = Batcher(vocab_file, 50)

    # Input placeholders to the biLM.
    sentence_character_ids = tf.placeholder('int32', shape=(None, None, 50))

    # Build the biLM graph.
    bilm = BidirectionalLanguageModel(options_file,
                                      weight_file,
                                      max_batch_size=300)

    # Get ops to compute the LM embeddings.
    sentence_embeddings_op = bilm(sentence_character_ids)

    # Get an op to compute ELMo (weighted average of the internal biLM layers)
    # Our model includes ELMo at both the input and output layers
    # of the task GRU, so we need 2x ELMo representations at each of the input and output.

    elmo_sentence_input = weight_layers('input',
                                        sentence_embeddings_op,
                                        use_top_only=top)
    return batcher, sentence_character_ids, elmo_sentence_input
Example #11
def load_elmo_embeddings(directory, top=False):
    """
    :param directory: directory with an ELMo model ('model.hdf5', 'options.json' and 'vocab.txt.gz')
    :param top: use only the top ELMo layer
    :return: ELMo batcher, character id placeholders, op object
    """
    vocab_file = os.path.join(directory, 'vocab.txt.gz')
    options_file = os.path.join(directory, 'options.json')
    weight_file = os.path.join(directory, 'model.hdf5')

    # Create a Batcher to map text to character ids.
    batcher = Batcher(vocab_file, 50)

    # Input placeholders to the biLM.
    sentence_character_ids = tf.placeholder('int32', shape=(None, None, 50))

    # Build the biLM graph.
    bilm = BidirectionalLanguageModel(options_file, weight_file, max_batch_size=300)

    # Get ops to compute the LM embeddings.
    sentence_embeddings_op = bilm(sentence_character_ids)

    # Get an op to compute ELMo (weighted average of the internal biLM layers)
    elmo_sentence_input = weight_layers('input', sentence_embeddings_op, use_top_only=top)
    return batcher, sentence_character_ids, elmo_sentence_input
Example #12
    def get_feed_dict(self,
                      words,
                      words_raw,
                      labels=None,
                      lr=None,
                      dropout=None):
        char_ids, word_ids = zip(*words)
        self.word = word_ids
        word_ids, sequence_lengths = pad_sequences(
            word_ids, self.config.vocab_words['$pad$'], self.max_word_lengths,
            self.max_sequence_lengths)
        char_ids, word_lengths = pad_sequences(
            char_ids,
            self.config.vocab_chars['$pad$'],
            self.max_word_lengths,
            self.max_sequence_lengths,
            nlevels=2)

        if self.config.use_emlo:
            batcher = Batcher("model_emlo/vocab.txt", 50)
            elmo_char_ids = batcher.batch_sentences(words_raw,
                                                    self.max_sequence_lengths)
        # build feed dictionary
        feed = {
            self.word_ids: word_ids,
            self.sequence_lengths: sequence_lengths
        }

        if self.config.use_char_cnn or self.config.use_char_lstm:
            feed[self.char_ids] = char_ids
            feed[self.word_lengths] = word_lengths
        if self.config.use_emlo:
            feed[self.char_ids_elmo] = elmo_char_ids

        if labels is not None:
            labels, _ = pad_sequences(labels, 0, self.max_word_lengths,
                                      self.max_sequence_lengths)
            feed[self.labels] = labels

        if lr is not None:
            feed[self.lr] = lr

        if dropout is not None:
            feed[self.dropout] = dropout

        return feed, sequence_lengths
Example #13
 def __init__(self, config):
     super(NERModel, self).__init__(config)
     self.idx_to_tag = {
         idx: tag
         for tag, idx in list(self.config.vocab_tags.items())
     }
     if self.config.use_elmo:
         # self.elmo_inputs = []
         self.batcher = Batcher(self.config.filename_words, 50)
         self.bilm = BidirectionalLanguageModel(
             self.config.filename_elmo_options,
             self.config.filename_elmo_weights)
         self.elmo_token_ids = tf.placeholder('int32',
                                              shape=(None, None, 50))
         self.elmo_embeddings_op = self.bilm(self.elmo_token_ids)
         self.elmo_embeddings_input = weight_layers('input',
                                                    self.elmo_embeddings_op,
                                                    l2_coef=0.0)
Example #14
def get_batcher():
    with open(FLAGS.elmo_options, 'r') as fin:
        options = json.load(fin)

    max_word_length = options['char_cnn']['max_characters_per_token']

    elmo_batcher = Batcher(FLAGS.elmo_vocab, max_word_length)

    return elmo_batcher
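
A short usage sketch, assuming FLAGS.elmo_options and FLAGS.elmo_vocab point at valid files:

elmo_batcher = get_batcher()
# Character ids with shape (n_sentences, max_sentence_len + 2, max_word_length).
char_ids = elmo_batcher.batch_sentences([['A', 'tokenized', 'sentence']])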
Example #15
def get_elmo_embeddings(config):

    batcher = Batcher(config.filename_words, 50)

    token_ids = tf.placeholder('int32', shape=(None, None, 50))
    bilm = BidirectionalLanguageModel(
        config.filename_elmo_options,
        config.filename_elmo_weights,
    )

    elmo_embeddings_op = bilm(token_ids)
    elmo_context_input = weight_layers('input',
                                       elmo_embeddings_op,
                                       l2_coef=0.0)

    with tf.Session() as sess:
        # It is necessary to initialize variables once before running inference.

        sess.run(tf.global_variables_initializer())

        # Create batches of data.
        train = CoNLLDataset(config.filename_train)
        sents_train = [entry[0] for entry in train]
        sent_ids_train = batcher.batch_sentences(sents_train)

        # Compute ELMo representations (here for the input only, for simplicity).

        elmo_input = sess.run([elmo_context_input['weighted_op']],
                              feed_dict={token_ids: sent_ids_train[0]})
        for batch in sent_ids_train[1:]:
            elmo_input_ = sess.run([elmo_context_input['weighted_op']],
                                   feed_dict={token_ids: batch})
            elmo_input = np.hstack((elmo_input, elmo_input_))

        test = CoNLLDataset(config.filename_test)
        sents_test = [entry[0] for entry in test]
        sent_ids_test = batcher.batch_sentences(sents_test)

        elmo_context_output_ = sess.run([elmo_context_input['weighted_op']],
                                        feed_dict={token_ids: sent_ids_test})

    return elmo_input, elmo_context_output_
    def list_to_embeddings_with_dump(self,
                                     batch: List[List[str]],
                                     outfile_to_dump=None):
        """
        Parameters
        ----------
        batch : ``List[List[str]]``, required
            A list of tokenized sentences.

        """
        document_embeddings = []

        if batch == [[]]:
            raise ValueError('Batch should not be empty')
        else:

            if self.word_embedding_file is None:
                batcher = Batcher(self.voc_file_path, self.max_word_length)
            else:
                batcher = TokenBatcher(self.voc_file_path)
            config = tf.ConfigProto(allow_soft_placement=True)
            with tf.Session(config=config) as sess:
                sess.run(tf.global_variables_initializer())
                ids_list = batcher.batch_sentences(batch)
                with h5py.File(outfile_to_dump, 'w') as fout:
                    for i, ids in enumerate(tqdm(ids_list,
                                                 total=len(ids_list))):
                        _ops = sess.run(
                            self.ops, feed_dict={self.ids_placeholder: [ids]})
                        mask = _ops['mask']
                        lm_embeddings = _ops['lm_embeddings'][0, :]
                        token_embeddings = _ops['token_embeddings']
                        lengths = _ops['lengths']
                        length = int(mask.sum())
                        document_embeddings.append(lm_embeddings)
                        ds = fout.create_dataset('{}'.format(i),
                                                 lm_embeddings.shape,
                                                 dtype='float32',
                                                 data=lm_embeddings)
                document_embeddings = np.asarray(document_embeddings)
        return document_embeddings
    def list_to_embeddings(self, batch: List[List[str]], slice=None):
        """
        Parameters
        ----------
        batch : ``List[List[str]]``, required
            A list of tokenized sentences.

        """
        elmo_embeddings = []

        if batch == [[]]:
            if slice is None:
                elmo_embeddings.append(empty_embedding(self.dims))
            else:
                if slice > 2:
                    raise ValueError('Slice can not be larger than 3')
                elmo_embeddings.append(empty_embedding(self.dims, True))
        else:
            batcher = Batcher(self.voc_file_path, self.max_word_length)
            config = tf.ConfigProto(allow_soft_placement=True)
            with tf.Session(config=config) as sess:
                sess.run(tf.global_variables_initializer())
                for i, _contents in enumerate(tqdm(batch, total=len(batch))):
                    char_ids = batcher.batch_sentences([_contents])
                    _ops = sess.run(self.ops,
                                    feed_dict={self.ids_placeholder: char_ids})
                    mask = _ops['mask']
                    lm_embeddings = _ops['lm_embeddings']
                    token_embeddings = _ops['token_embeddings']
                    lengths = _ops['lengths']
                    length = int(mask.sum())
                    if slice is None:
                        lm_embeddings_mean = np.apply_over_axes(
                            np.mean, lm_embeddings[0], (0, 1))
                    else:
                        lm_embeddings_mean = np.apply_over_axes(
                            np.mean, lm_embeddings[0][slice], (0))
                    elmo_embeddings.append(lm_embeddings_mean)

        return elmo_embeddings
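
The file written by list_to_embeddings_with_dump above stores one dataset per sentence, keyed by its index in the batch. A minimal sketch of reading it back (the file name is an assumption):

import h5py

with h5py.File('elmo_layers.hdf5', 'r') as fin:
    # Each dataset has shape (n_lm_layers, sentence_len, lm_dim).
    first_sentence_layers = fin['0'][...]
    print(first_sentence_layers.shape)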
Example #18
    def _load_embeddings(self,
                         vocab="vocab.txt",
                         options="elmo_options.json",
                         weights="elmo_weights.hdf5"):
        self.elmo_model = BidirectionalLanguageModel(options, weights)
        self.batcher = Batcher(vocab, 50)

        self.character_ids = tf.placeholder('int32', shape=(None, None, 50))
        context_embeddings_op = self.elmo_model(self.character_ids)
        self.elmo_context_output = weight_layers('output',
                                                 context_embeddings_op,
                                                 l2_coef=0.0)

        tf.global_variables_initializer().run()
Example #19
    def __init__(self,
                 vocab_file,
                 max_seq_length,
                 max_token_length=None,
                 stroke_vocab_file=None,
                 tran2sim=False,
                 sim2tran=False):
        self.vocab_file = vocab_file
        self.max_seq_length = max_seq_length
        self.max_token_length = max_token_length

        max_seq_length = self.max_seq_length - 2  # subtract 2 because <bos> and <eos> will be added
        self.token_batcher = TokenBatcher(self.vocab_file, max_seq_length)
        if max_token_length:
            self.batcher = Batcher(self.vocab_file, self.max_token_length,
                                   max_seq_length, stroke_vocab_file)

        self.convert_config = None
        if tran2sim and sim2tran:
            assert tran2sim != sim2tran
        elif tran2sim:
            self.convert_config = "t2s.json"
        elif sim2tran:
            self.convert_config = "s2t.json"
Example #20
def prepro(config):
    word_counter, char_counter = Counter(), Counter()
    train_examples, train_eval = process_file(config.train_file, "train",
                                              word_counter, char_counter)
    dev_examples, dev_eval = process_file(config.dev_file, "dev", word_counter,
                                          char_counter)
    test_examples, test_eval = process_file(config.test_file, "test",
                                            word_counter, char_counter)

    word_emb_file = config.fasttext_file if config.fasttext else config.glove_word_file
    char_emb_file = config.glove_char_file if config.pretrained_char else None
    char_emb_size = config.glove_char_size if config.pretrained_char else None
    char_emb_dim = config.glove_dim if config.pretrained_char else config.char_dim

    word_emb_mat, word2idx_dict = get_embedding(word_counter,
                                                "word",
                                                emb_file=word_emb_file,
                                                size=config.glove_word_size,
                                                vec_size=config.glove_dim)
    char_emb_mat, char2idx_dict = get_embedding(char_counter,
                                                "char",
                                                emb_file=char_emb_file,
                                                size=char_emb_size,
                                                vec_size=char_emb_dim)

    batcher = Batcher(config.elmo_vocab_file, config.cont_char_limit)

    build_features(config, train_examples, "train", config.train_record_file,
                   word2idx_dict, char2idx_dict, False, batcher)
    dev_meta = build_features(config, dev_examples, "dev",
                              config.dev_record_file, word2idx_dict,
                              char2idx_dict, False, batcher)
    test_meta = build_features(config, test_examples, "test",
                               config.test_record_file, word2idx_dict,
                               char2idx_dict, True, batcher)

    save(config.word_emb_file, word_emb_mat, message="word embedding")
    save(config.char_emb_file, char_emb_mat, message="char embedding")
    save(config.train_eval_file, train_eval, message="train eval")
    save(config.dev_eval_file, dev_eval, message="dev eval")
    save(config.test_eval_file, test_eval, message="test eval")
    save(config.dev_meta, dev_meta, message="dev meta")
    save(config.test_meta, test_meta, message="test meta")
    save(config.word_dictionary, word2idx_dict, message="word dictionary")
    save(config.char_dictionary, char2idx_dict, message="char dictionary")
Example #21
class ElmoEmbedding:
    def __init__(self, model_path):
        vocab_file = os.path.join(model_path, 'vocabs.txt')
        options_file = os.path.join(model_path, 'options.json')
        weight_file = os.path.join(model_path, 'weights.hdf5')
        with open(options_file, "r") as fj:
            options = json.load(fj)
        self.max_characters_per_token = options['char_cnn']['max_characters_per_token']        

        # Create a Batcher to map text to character ids.
        self.batcher = Batcher(vocab_file, self.max_characters_per_token)
        # Build the biLM graph.
        self.bilm = BidirectionalLanguageModel(options_file, weight_file)


    def __call__(self, tokenized_sentences_lst):
        # Input placeholders to the biLM.
        context_character_ids = tf.placeholder('int32', shape=(None, None, self.max_characters_per_token))

        # Get ops to compute the LM embeddings.
        context_embeddings_op = self.bilm(context_character_ids)

        # Get an op to compute ELMo (weighted average of the internal biLM layers)
        elmo_context_input = weight_layers('input', context_embeddings_op, l2_coef=0.0)
        elmo_context_output = weight_layers('output', context_embeddings_op, l2_coef=0.0)

        # Now we can compute embeddings.
        context_tokens  = [sentence.split() for sentence in tokenized_sentences_lst]

        with tf.Session() as sess:
            # It is necessary to initialize variables once before running inference.
            sess.run(tf.global_variables_initializer())

            # Create batches of data.
            context_ids = self.batcher.batch_sentences(context_tokens)

            # Compute ELMo representations (here for the input only, for simplicity).
            elmo_context_vecs = sess.run(
            [elmo_context_input['weighted_op']],
            feed_dict={context_character_ids: context_ids}
            )

        return elmo_context_vecs[0]  #, context_tokens, context_ids
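
A usage sketch for the class above; the model directory is a placeholder assumption and must contain 'vocabs.txt', 'options.json' and 'weights.hdf5':

elmo = ElmoEmbedding('/path/to/elmo-model')
# Sentences are whitespace-tokenized inside __call__.
vectors = elmo(['this is a sentence', 'and another one'])
print(vectors.shape)  # (n_sentences, max_sentence_len, lm_dim)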
class ELMoRunner:
    def __init__(self, session, bilm_params):
        self.params = bilm_params

        # Create a Batcher to map text to character ids.
        self.batcher = Batcher(self.params.vocab_file,
                               self.params.max_char_len)

        # Input placeholders to the biLM.
        self.sentence_character_ids = tf.placeholder(
            'int32', shape=(None, None, self.params.max_char_len))

        # Build the biLM graph.
        bilm = BidirectionalLanguageModel(
            self.params.options_file,
            self.params.weights_file,
        )

        # Get ops to compute the LM embeddings.
        sentence_embeddings_op = bilm(self.sentence_character_ids)

        self.elmo_sentence_input = weight_layers('input',
                                                 sentence_embeddings_op,
                                                 l2_coef=0.0,
                                                 use_top_only=True)

        self.sess = session
        self.sess.run(tf.global_variables_initializer())

    def preprocess(self, sentences_words):
        return self.batcher.batch_sentences(sentences_words)

    def __call__(self, batch_sentence_ids):
        (elmo_sentence_input_, ) = self.sess.run(
            [self.elmo_sentence_input['weighted_op']],
            feed_dict={self.sentence_character_ids: batch_sentence_ids})
        return elmo_sentence_input_
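
A usage sketch for ELMoRunner, with a hypothetical SimpleNamespace standing in for bilm_params (all paths are assumptions):

from types import SimpleNamespace
import tensorflow as tf

params = SimpleNamespace(vocab_file='vocab.txt',
                         max_char_len=50,
                         options_file='options.json',
                         weights_file='weights.hdf5')

sess = tf.Session()
runner = ELMoRunner(sess, params)

batch_ids = runner.preprocess([['A', 'tokenized', 'sentence']])
# Top-layer ELMo vectors, shape (batch_size, sentence_len, lm_dim).
elmo_vectors = runner(batch_ids)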
Example #23
    def __init__(
        self,
        request_names=['train', 'valid', 'test'],
        new_names=['train', 'valid', 'test'],
        classes_name='classes',
        op_type='vectorizer',
        op_name='elmo',
        dimension=1024,
        file_type='bin',  #TODO: ?
        options_file='./embeddingsruwiki_pp_1.0_elmo/options.json',  #TODO: ?
        weights_file='./embeddingsruwiki_pp_1.0_elmo/weights.hdf5',  #TODO: ?
        vocab_file='./embeddingsruwiki_pp_1.0_elmo/vocab.txt'  #TODO: ?
    ):
        super().__init__(request_names, new_names, op_type, op_name)
        self.file_type = file_type
        self.classes_name = classes_name
        self.dimension = dimension
        # Location of pretrained LM.
        self.options_file = options_file
        self.weights_file = weights_file
        self.vocab_file = vocab_file
        # Create a Batcher to map text to character ids.
        char_per_token = 50
        self.batcher = Batcher(self.vocab_file, char_per_token)
        # Input placeholders to the biLM.
        self.character_ids = tf.placeholder('int32',
                                            shape=(None, None, char_per_token))
        # Build the biLM graph.
        bilm = BidirectionalLanguageModel(self.options_file, self.weights_file)

        # Get ops to compute the LM embeddings.
        embeddings_op = bilm(self.character_ids)

        # Get an op to compute ELMo (weighted average of the internal biLM layers)
        self.elmo_output = weight_layers('elmo_output',
                                         embeddings_op,
                                         l2_coef=0.0)
Example #24
    def __init__(self,
                 args,
                 is_training=True,
                 emb_class='glove',
                 use_crf=True):
        self.emb_path = args.emb_path
        self.embvec = pkl.load(open(
            self.emb_path, 'rb'))  # resources(glove, vocab, path, etc)
        self.wrd_dim = args.wrd_dim  # size of word embedding(glove)
        self.chr_dim = 25  # size of character embedding
        self.pos_dim = 7  # size of part of speech embedding
        self.class_size = len(self.embvec.tag_vocab)  # number of class(tags)
        self.word_length = args.word_length  # maximum character size of word for convolution
        self.restore = args.restore  # checkpoint path if available
        self.use_crf = use_crf  # use crf decoder or not
        self.emb_class = emb_class  # class of embedding(glove, elmo, bert)

        self.keep_prob = 0.7  # keep probability for dropout
        self.chr_conv_type = 'conv1d'  # conv1d | conv2d
        self.filter_sizes = [3]  # filter sizes
        self.num_filters = 53  # number of filters
        self.highway_used = False  # use highway network on the concatenated input
        self.rnn_used = True  # use rnn layer or not
        self.rnn_num_layers = 2  # number of RNN layers
        self.rnn_type = 'fused'  # normal | fused
        self.rnn_size = 200  # size of RNN hidden unit
        self.tf_used = False  # use transformer encoder layer or not
        if self.tf_used:
            # modified for transformer
            self.starter_learning_rate = 0.0003
        self.tf_num_layers = 4  # number of layers for transformer encoder
        self.tf_keep_prob = 0.8  # keep probability for transformer encoder
        self.tf_mh_num_heads = 4  # number of head for multi head attention
        self.tf_mh_num_units = 64  # Q,K,V dimension for multi head attention
        self.tf_mh_keep_prob = 0.8  # keep probability for multi head attention
        self.tf_ffn_kernel_size = 3  # conv1d kernel size for feed forward net
        self.tf_ffn_keep_prob = 0.8  # keep probability for feed forward net

        self.starter_learning_rate = 0.001  # default learning rate
        self.decay_steps = 12000
        self.decay_rate = 0.9
        self.clip_norm = 10

        self.is_training = is_training
        if self.is_training:
            self.epoch = args.epoch
            self.batch_size = args.batch_size
            self.dev_batch_size = 2 * self.batch_size
            self.checkpoint_dir = args.checkpoint_dir
            self.summary_dir = args.summary_dir
        '''
        modified for glove(300, 6B), self.tf_used == False
        self.rnn_size = 276
        self.keep_prob = 0.32
        '''

        if self.emb_class == 'elmo':
            from bilm import Batcher, BidirectionalLanguageModel
            self.word_length = 50  # replace to fixed word length for the pre-trained elmo : 'max_characters_per_token'
            self.elmo_batcher = Batcher(
                self.embvec.elmo_vocab_path,
                self.word_length)  # map text to character ids
            self.elmo_bilm = BidirectionalLanguageModel(
                self.embvec.elmo_options_path,
                self.embvec.elmo_weight_path)  # biLM graph
            self.elmo_keep_prob = 0.7
            # modified for elmo
            self.highway_used = False
        if self.emb_class == 'bert':
            from bert import modeling
            from bert import tokenization
            self.bert_config = modeling.BertConfig.from_json_file(
                self.embvec.bert_config_path)
            self.bert_tokenizer = tokenization.FullTokenizer(
                vocab_file=self.embvec.bert_vocab_path,
                do_lower_case=self.embvec.bert_do_lower_case)
            self.bert_init_checkpoint = self.embvec.bert_init_checkpoint
            self.bert_max_seq_length = self.embvec.bert_max_seq_length
            self.bert_keep_prob = 0.8
            # modified for bert
            self.highway_used = False
            self.rnn_size = 256
            self.starter_learning_rate = 2e-5
            self.decay_steps = 5000
            self.decay_rate = 0.9
            self.clip_norm = 1.5
            self.use_bert_optimization = False
            self.num_train_steps = 0  # number of total training steps
            self.num_warmup_steps = 0  # number of warmup steps
            self.warmup_proportion = 0.1  # proportion of training to perform linear learning rate warmup for
            if self.is_training:
                self.dev_batch_size = self.batch_size  # set batch_size == dev_batch_size
Example #25
Below, we show usage for SQuAD where each input example consists of both
a question and a paragraph of context.
'''

import tensorflow as tf
import os
from bilm import Batcher, BidirectionalLanguageModel, weight_layers

# Location of pretrained LM.  Here we use the test fixtures.
datadir = os.path.join('tests', 'fixtures', 'model')
vocab_file = os.path.join(datadir, 'vocab_test.txt')
options_file = os.path.join(datadir, 'options.json')
weight_file = os.path.join(datadir, 'lm_weights.hdf5')

# Create a Batcher to map text to character ids.
batcher = Batcher(vocab_file, 50)

# Input placeholders to the biLM.
context_character_ids = tf.placeholder('int32', shape=(None, None, 50))
question_character_ids = tf.placeholder('int32', shape=(None, None, 50))

# Build the biLM graph.
bilm = BidirectionalLanguageModel(options_file, weight_file)

# Get ops to compute the LM embeddings.
context_embeddings_op = bilm(context_character_ids)
question_embeddings_op = bilm(question_character_ids)

# Get an op to compute ELMo (weighted average of the internal biLM layers)
# Our SQuAD model includes ELMo at both the input and output layers
# of the task GRU, so we need 4x ELMo representations for the question
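
The snippet is truncated at this point; in the other examples in this collection the corresponding weighted ELMo ops are obtained roughly as follows (a sketch, not the original continuation):

elmo_context_input = weight_layers('input', context_embeddings_op, l2_coef=0.0)
with tf.variable_scope('', reuse=True):
    # Reuse the scalar mixing weights learned for the context on the question.
    elmo_question_input = weight_layers('input', question_embeddings_op, l2_coef=0.0)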
Example #26
class QAModel(object):
    def __init__(self, FLAGS, id2word, word2id, emb_matrix, id2char, char2id):
        self.FLAGS = FLAGS
        self.id2word = id2word
        self.word2id = word2id
        self.emb_matrix = emb_matrix
        self.id2char = id2char
        self.char2id = char2id

        self.batcher = Batcher(
            "/Users/lam/Desktop/Lam-cs224n/Projects/qa/squad/data/elmo/elmo_vocab.txt",
            50)
        self.filters = [(5, 10)]  #change back to 100 after

        self.options_file = "/Users/lam/Desktop/Lam-cs224n/Projects/qa/squad/data/elmo/elmo.json"
        self.weight_file = "/Users/lam/Desktop/Lam-cs224n/Projects/qa/squad/data/elmo/lm_weight.hdf5"

        with tf.variable_scope(
                "QAModel",
                initializer=tf.contrib.layers.variance_scaling_initializer(
                    factor=1.0, uniform=True)):
            self.add_placeholders()
            self.add_embedding_layer(emb_matrix)
        self.add_elmo_embedding_layer(self.options_file, self.weight_file)
        with tf.variable_scope(
                "QAModel",
                initializer=tf.contrib.layers.variance_scaling_initializer(
                    factor=1.0, uniform=True)):
            self.build_graph()
            self.add_loss()

        # Define trainable parameters, gradient, gradient norm, and clip by gradient norm
        params = tf.trainable_variables(
            "QAModel")  #since only one scope "QAModel"
        gradients = tf.gradients(
            self.loss,
            params)  # d(loss)/d(params) return list of (length len(params))
        self.gradient_norm = tf.global_norm(gradients)
        clipped_gradients, _ = tf.clip_by_global_norm(
            gradients,
            5.0)  #return list_clipped, global_norm(here we don't need this)
        self.param_norm = tf.global_norm(params)

        # Define optimizer and updates
        # (updates is what you need to fetch in session.run to do a gradient update)
        self.global_step = tf.Variable(0, name="global_step", trainable=False)
        #This will increment the global step if global_step is not None
        opt = tf.train.AdamOptimizer(
            learning_rate=0.001)  # you can try other optimizers
        self.updates = opt.apply_gradients(zip(clipped_gradients, params),
                                           global_step=self.global_step)

        self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=1)
        self.bestmodel_saver = tf.train.Saver(tf.global_variables(),
                                              max_to_keep=1)

        self.summaries = tf.summary.merge_all()

    def add_placeholders(self):
        self.context_ids = tf.placeholder(tf.int32)
        self.context_mask = tf.placeholder(tf.int32)
        self.qn_ids = tf.placeholder(tf.int32)
        self.qn_mask = tf.placeholder(tf.int32)
        self.ans_span = tf.placeholder(tf.int32, shape=[None, 2])

        #NOTE:CHANGE
        #self.context_char = tf.placeholder(tf.int32, shape=[None, self.FLAGS.context_len, self.FLAGS.max_word_len])
        #self.qn_char = tf.placeholder(tf.int32, shape=[None, self.FLAGS.question_len, self.FLAGS.max_word_len])
        #The following two may not be necessary
        #self.context_char_mask = tf.placeholder(tf.int32, shape=[None, self.FLAGS.context_len, self.FLAGS.max_word_len])
        #self.qn_char_mask = tf.placeholder(tf.int32, shape=[None, self.FLAGS.question_len, self.FLAGS.max_word_len])
        self.context_elmo = tf.placeholder('int32', shape=[None, None, 50])
        self.qn_elmo = tf.placeholder('int32', shape=[None, None, 50])

        # Add a placeholder to feed in the keep probability (for dropout).
        # This is necessary so that we can instruct the model to use dropout when training, but not when testing
        self.keep_prob = tf.placeholder_with_default(1.0, shape=())

    def add_embedding_layer(self, emb_matrix):
        with tf.variable_scope("embeddings"):
            #set to constant so its untrainable
            embedding_matrix = tf.constant(
                emb_matrix, dtype=tf.float32,
                name="emb_matrix")  # shape (400002, embedding_size)

            # Get the word embeddings for the context and question,
            self.context_embs = tf.nn.embedding_lookup(embedding_matrix,
                                                       self.context_ids)
            self.qn_embs = tf.nn.embedding_lookup(embedding_matrix,
                                                  self.qn_ids)

        #self.add_char_embedding_layer()

    def add_elmo_embedding_layer(self,
                                 options_file,
                                 weight_file,
                                 output_use=False):
        """
        Adds ELMo lstm embeddings to the graph.

        Inputs:
            options_file: json_file for the pretrained model
            weight_file: weights hdf5 file for the pretrained model
            output_use: determine if use elmo in output of biRNN (default False)
        """
        #Build biLM graph
        bilm = BidirectionalLanguageModel(options_file, weight_file)
        context_embeddings_op = bilm(self.context_elmo)
        question_embeddings_op = bilm(self.qn_elmo)

        # Get an op to compute ELMo (weighted average of the internal biLM layers)
        # Our SQuAD model includes ELMo at both the input and output layers
        # of the task GRU, so we need 4x ELMo representations for the question
        # and context at each of the input and output.
        # We use the same ELMo weights for both the question and context
        # at each of the input and output.
        #compute the final ELMo representations.
        self.elmo_context_input = weight_layers(
            'input', context_embeddings_op,
            l2_coef=0.001)['weighted_op']  #(batch size, context size, ????)
        with tf.variable_scope('', reuse=True):
            # the reuse=True scope reuses weights from the context for the question
            self.elmo_question_input = weight_layers(
                'input', question_embeddings_op, l2_coef=0.001)['weighted_op']

        if output_use:
            self.elmo_context_output = weight_layers(
                'output', context_embeddings_op, l2_coef=0.001)['weighted_op']
            with tf.variable_scope('', reuse=True):
                # the reuse=True scope reuses weights from the context for the question
                self.elmo_question_output = weight_layers(
                    'output', question_embeddings_op,
                    l2_coef=0.001)['weighted_op']

    def build_graph(self):
        context_embs_concat = tf.concat(
            [self.elmo_context_input, self.context_embs],
            2)  #(batch_size, qn_len, 1024+self.FLAGS.embedding_size)

        context_embs_concat.set_shape(
            (None, None, 1024 + self.FLAGS.embedding_size))
        #qn_embs_concat.set_shape((None, None, 1024+self.FLAGS.embedding_size))
        self.qn_mask.set_shape((None, None))
        self.context_mask.set_shape((None, None))

        with tf.variable_scope("start"):
            softmax_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_start.build_graph(
                context_embs_concat, self.context_mask)
        with tf.variable_scope("end"):
            softmax_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_end.build_graph(
                context_embs_concat, self.context_mask)

    def add_loss(self):
        with tf.variable_scope("loss"):
            # Calculate loss for prediction of start position
            loss_start = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.logits_start,
                labels=self.ans_span[:,
                                     0])  # loss_start has shape (batch_size)
            self.loss_start = tf.reduce_mean(loss_start)
            tf.summary.scalar('loss_start', self.loss_start)

            # Calculate loss for prediction of end position
            loss_end = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.logits_end, labels=self.ans_span[:, 1])
            self.loss_end = tf.reduce_mean(loss_end)
            tf.summary.scalar('loss_end', self.loss_end)

            # Add two losses
            self.loss = self.loss_start + self.loss_end
            tf.summary.scalar('loss', self.loss)

    def run_train_iter(self, session, batch, summary_writer):
        input_feed = {}
        input_feed[self.context_ids] = batch.context_ids
        input_feed[self.context_mask] = batch.context_mask

        #NOTE: CHANGE added context_char
        #input_feed[self.context_char] = batch.context_char
        input_feed[self.context_elmo] = self.batcher.batch_sentences(
            batch.context_tokens)

        input_feed[self.qn_ids] = batch.qn_ids
        input_feed[self.qn_mask] = batch.qn_mask

        #NOTE: CHANGE added qn_char
        #input_feed[self.qn_char] = batch.qn_char
        input_feed[self.qn_elmo] = self.batcher.batch_sentences(
            batch.qn_tokens)

        input_feed[self.ans_span] = batch.ans_span
        input_feed[self.keep_prob] = 1.0 - self.FLAGS.dropout  # apply dropout

        output_feed = [
            self.updates, self.summaries, self.loss, self.global_step,
            self.param_norm, self.gradient_norm
        ]

        #output_feed = [self.elmo_context_input]
        [_, summaries, loss, global_step, param_norm,
         gradient_norm] = session.run(output_feed, feed_dict=input_feed)

        print("FINISHED")

    def train(self, session, train_context_path, train_qn_path, train_ans_path,
              dev_qn_path, dev_context_path, dev_ans_path):
        summary_writer = tf.summary.FileWriter(
            "/Users/lam/Desktop/Lam-cs224n/Projects/qa/squad", session.graph)
        for batch in get_batch_generator(self.word2id,
                                         self.char2id,
                                         train_context_path,
                                         train_qn_path,
                                         train_ans_path,
                                         self.FLAGS.batch_size,
                                         self.FLAGS.context_len,
                                         self.FLAGS.question_len,
                                         self.FLAGS.max_word_len,
                                         discard_long=True):
            self.sample_batch = batch

            self.run_train_iter(session, batch, summary_writer)
            break
Example #27
args = parse_args()
dtypes = args.dtypes.split(':')
trial_num = max(1, args.trial_num)

###
#args.exptdir = pwd/data
#args.datadir = trial

# We will use "${args.exptdir}/alltrain.epitope.elmo" as the model directory
model_dir = join(args.exptdir, 'alltrain.epitope.elmo', 'best_model')
vocab_file = join(args.exptdir, 'alltrain.epitope.vocab')
options_file = join(model_dir, 'pred.options.json')
weight_file = join(model_dir, 'weights.h5')

# Create a Batcher to map text to character ids.
batcher = Batcher(vocab_file, 50)

# Input placeholders to the biLM.
context_character_ids = tf.placeholder('int32', shape=(None, None, 50))
bilm = BidirectionalLanguageModel(options_file, weight_file)

context_embeddings_op = bilm(context_character_ids)

elmo_context_input = weight_layers('input', context_embeddings_op, l2_coef=0.0)
elmo_context_output = weight_layers('output',
                                    context_embeddings_op,
                                    l2_coef=0.0)

with tf.Session() as sess:
    # It is necessary to initialize variables once before running inference.
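    # (Truncated in the original; a hedged sketch of the usual inference steps,
    # mirroring the other snippets in this collection. 'sentences' is hypothetical.)
    sess.run(tf.global_variables_initializer())

    sentences = [['M', 'K', 'T', 'A', 'Y']]
    context_ids = batcher.batch_sentences(sentences)

    elmo_vectors = sess.run(elmo_context_input['weighted_op'],
                            feed_dict={context_character_ids: context_ids})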
Example #28
def build_features(config,
                   examples,
                   data_type,
                   out_file,
                   word2idx_dict,
                   is_test=False):

    para_limit = config.para_limit
    ques_limit = config.ques_limit
    turn_limit = config.turn_limit

    def filter_func(example):
        return len(example["tokenized_context"]
                   ) > para_limit  #or len(example["ques_tokens"]) > ques_limit

    print("Processing {} examples...".format(data_type))
    writer = tf.python_io.TFRecordWriter(out_file)
    total = 0
    total_ = 0
    meta = {}

    max_char_length = config.max_char_length
    batcher = Batcher(config.elmo_vocab_file, max_char_length)
    for example in tqdm(examples):
        total_ += 1

        if filter_func(example):
            continue

        total += 1
        context_idxs = np.zeros([para_limit], dtype=np.int32)
        questions_idxs = np.zeros([turn_limit, ques_limit], dtype=np.int32)
        context_char_idxs = np.zeros([para_limit + 2, max_char_length],
                                     dtype=np.int32)
        questions_char_idxs = np.zeros(
            [turn_limit, ques_limit + 2, max_char_length], dtype=np.int32)
        starts = np.zeros([turn_limit, para_limit], dtype=np.float32)
        ends = np.zeros([turn_limit, para_limit], dtype=np.float32)
        em = np.zeros([turn_limit, para_limit], dtype=np.int32)
        yes_answers = np.zeros([turn_limit], dtype=np.int32)
        no_answers = np.zeros([turn_limit], dtype=np.int32)
        unk_answers = np.zeros([turn_limit], dtype=np.int32)
        span_flag = np.zeros([turn_limit], dtype=np.int32)

        def _get_word(word):
            for each in (word, word.lower(), word.capitalize(), word.upper()):
                if each in word2idx_dict:
                    return word2idx_dict[each]
            return 1

        def _check_word_in_question(word, question):
            for token in question:
                if word.lower() == token.lower():
                    return True
            return False

        # type: List[str]
        tokenized_context = example["tokenized_context"]
        length = len(tokenized_context) + 2
        context_char_idxs_without_mask = batcher._lm_vocab.encode_chars(
            tokenized_context, split=False)
        context_char_idxs[:length, :] = context_char_idxs_without_mask + 1

        for k, sent in enumerate(example["tokenized_questions"]):
            length = len(sent) + 2
            question_char_idxs_without_mask = batcher._lm_vocab.encode_chars(
                sent, split=False)
            questions_char_idxs[
                k, :length, :] = question_char_idxs_without_mask + 1

        # get em and context indexes vector
        for i, token in enumerate(tokenized_context):
            context_idxs[i] = _get_word(token)
            for j, tokenized_question in enumerate(
                    example["tokenized_questions"]):
                if _check_word_in_question(token, tokenized_question):
                    em[j, i] = 1

        # get question indexes vector
        for i, tokenized_question in enumerate(example["tokenized_questions"]):
            for j, token in enumerate(tokenized_question):
                questions_idxs[i, j] = _get_word(token)

        # get start vector
        for i, idx in enumerate(example["starts"]):
            starts[i, idx] = 1.0

        # get end vector
        for i, idx in enumerate(example["ends"]):
            ends[i, idx] = 1.0

        # get label of yes/no questions
        length = len(example["yes_answers"])
        yes_answers[:length] = example["yes_answers"]
        no_answers[:length] = example["no_answers"]
        unk_answers[:length] = example["unk_answers"]
        span_flag[:length] = example["span_flag"]

        feature_dict = {
            "context_idxs":
            tf.train.Feature(bytes_list=tf.train.BytesList(
                value=[context_idxs.tostring()])),
            "questions_idxs":
            tf.train.Feature(bytes_list=tf.train.BytesList(
                value=[questions_idxs.tostring()])),
            "context_char_idxs":
            tf.train.Feature(bytes_list=tf.train.BytesList(
                value=[context_char_idxs.tostring()])),
            "questions_char_idxs":
            tf.train.Feature(bytes_list=tf.train.BytesList(
                value=[questions_char_idxs.tostring()])),
            "starts":
            tf.train.Feature(bytes_list=tf.train.BytesList(
                value=[starts.tostring()])),
            "ends":
            tf.train.Feature(bytes_list=tf.train.BytesList(
                value=[ends.tostring()])),
            "em":
            tf.train.Feature(bytes_list=tf.train.BytesList(
                value=[em.tostring()])),
            "yes_answers":
            tf.train.Feature(bytes_list=tf.train.BytesList(
                value=[yes_answers.tostring()])),
            "no_answers":
            tf.train.Feature(bytes_list=tf.train.BytesList(
                value=[no_answers.tostring()])),
            "unk_answers":
            tf.train.Feature(bytes_list=tf.train.BytesList(
                value=[unk_answers.tostring()])),
            "span_flag":
            tf.train.Feature(bytes_list=tf.train.BytesList(
                value=[span_flag.tostring()]))
        }

        record = tf.train.Example(features=tf.train.Features(
            feature=feature_dict))
        writer.write(record.SerializeToString())

    print("Build {} / {} instances of features in total".format(total, total_))
    meta["total"] = total
    writer.close()
    return meta
Example #29
if os.path.isfile(os.path.join(datadir, 'vocab.txt.gz')):
    vocab_file = os.path.join(datadir, 'vocab.txt.gz')
elif os.path.isfile(os.path.join(datadir, 'vocab.txt')):
    vocab_file = os.path.join(datadir, 'vocab.txt')
else:
    print('No vocabulary file found. Exiting...')
    exit()

options_file = os.path.join(datadir, 'options.json')
weight_file = os.path.join(datadir, 'model.hdf5')
with open(options_file, 'r') as f:
    m_options = json.load(f)
max_chars = m_options['char_cnn']['max_characters_per_token']

# Create a Batcher to map text to character ids.
batcher = Batcher(vocab_file, max_chars)

# Input placeholders to the biLM.
sentence_character_ids = tf.compat.v1.placeholder('int32',
                                                  shape=(None, None,
                                                         max_chars))

# Build the biLM graph.
bilm = BidirectionalLanguageModel(options_file,
                                  weight_file,
                                  max_batch_size=200)

# Get ops to compute the LM embeddings.
sentence_embeddings_op = bilm(sentence_character_ids)

# Get an op to compute ELMo (weighted average of the internal biLM layers)
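
The snippet stops just before the weight_layers call; the parallel load_elmo_embeddings examples earlier in this collection complete it roughly as follows (a sketch, not the original continuation):

elmo_sentence_input = weight_layers('input', sentence_embeddings_op, l2_coef=0.0)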
Example #30
class ELMoTuner(Tuner):

    def __init__(self, train_corpus_fname, test_corpus_fname,
                 vocab_fname, options_fname, pretrain_model_fname,
                 model_save_path, max_characters_per_token=30,
                 batch_size=32, num_labels=2):
        # Load a corpus.
        super().__init__(train_corpus_fname=train_corpus_fname,
                         tokenized_train_corpus_fname=train_corpus_fname + ".elmo-tokenized",
                         test_corpus_fname=test_corpus_fname,
                         tokenized_test_corpus_fname=test_corpus_fname + ".elmo-tokenized",
                         model_name="elmo", vocab_fname=vocab_fname,
                         model_save_path=model_save_path, batch_size=batch_size)
        # configurations
        self.options_fname = options_fname
        self.pretrain_model_fname = pretrain_model_fname
        self.max_characters_per_token = max_characters_per_token
        self.num_labels = 2 # positive, negative
        self.num_train_steps = (int((len(self.train_data) - 1) / self.batch_size) + 1) * self.num_epochs
        self.eval_every = int(self.num_train_steps / self.num_epochs)  # evaluate once per epoch
        # Create a Batcher to map text to character ids.
        # lm_vocab_file = ELMo can build input ids on the fly even without a token vocab,
        # but pre-building ids for frequent character sequences (i.e. the vocab) speeds up training
        # max_token_length = the maximum number of characters in each token
        self.batcher = Batcher(lm_vocab_file=vocab_fname, max_token_length=self.max_characters_per_token)
        self.training = tf.placeholder(tf.bool)
        # build train graph
        self.ids_placeholder, self.labels_placeholder, self.dropout_keep_prob, self.logits, self.loss = make_elmo_graph(options_fname,
                                                                                                                        pretrain_model_fname,
                                                                                                                        max_characters_per_token,
                                                                                                                        num_labels, tune=True)

    def tune(self):
        global_step = tf.train.get_or_create_global_step()
        optimizer = tf.train.AdamOptimizer(learning_rate=0.0001)
        grads_and_vars = optimizer.compute_gradients(self.loss)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
        output_feed = [train_op, global_step, self.logits, self.loss]
        saver = tf.train.Saver(max_to_keep=1)
        sess = tf.Session()
        sess.run(tf.global_variables_initializer())
        self.train(sess, saver, global_step, output_feed)

    def make_input(self, sentences, labels, is_training):
        current_input = self.batcher.batch_sentences(sentences)
        current_output = np.array(labels)
        if is_training:
            input_feed = {
                self.ids_placeholder: current_input,
                self.labels_placeholder: current_output,
                self.dropout_keep_prob: self.dropout_keep_prob_rate,
                self.training: True
            }
        else:
            input_feed_ = {
                self.ids_placeholder: current_input,
                self.labels_placeholder: current_output,
                self.dropout_keep_prob: 1.0,
                self.training: False
            }
            input_feed = [input_feed_, current_output]
        return input_feed
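
A usage sketch for the tuner above; every path here is a placeholder assumption:

tuner = ELMoTuner(train_corpus_fname='train.txt',
                  test_corpus_fname='test.txt',
                  vocab_fname='elmo-vocab.txt',
                  options_fname='options.json',
                  pretrain_model_fname='elmo.model',
                  model_save_path='/path/to/tune-ckpt')
tuner.tune()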
Example #31
test_datas, test_sample_num = transform_data(test_datas, all_tokens,
                                             batch_size)
test_batch_num = len(test_datas[0])
test_m_datas, test_m_sample_num = transform_data(test_m_datas, all_tokens,
                                                 batch_size)
test_m_batch_num = len(test_m_datas[0])
test_h_datas, test_h_sample_num = transform_data(test_h_datas, all_tokens,
                                                 batch_size)
test_h_batch_num = len(test_h_datas[0])

# build and save vocab file
with open(vocab_file, 'w') as fout:
    fout.write('\n'.join(all_tokens))

# Create a Batcher to map text to character ids.
batcher = Batcher(vocab_file, 50, max_context_length)
batcher2 = Batcher(vocab_file, 50, max_q_o_length)

# *** build models ***

# Input placeholders to the biLM.
context_character_ids = tf.placeholder('int32', shape=(None, None, 50))
question_character_ids = tf.placeholder('int32', shape=(None, None, 50))
options_character_ids = tf.placeholder('int32', shape=(None, None, 50))
context_lengths = tf.placeholder('int32', shape=(None, ))
question_lengths = tf.placeholder('int32', shape=(None, ))
options_lengths = tf.placeholder('int32', shape=(None, ))
labels = tf.placeholder('int32', shape=(None, ))

# Build the biLM graph.
bilm = BidirectionalLanguageModel(options_file, weight_file)