Example #1
    def __init__(self, user_interactive_items, user_list_file, item_list_file,
                 sess):
        self.user_items = user_interactive_items
        self.item_table = index_table_from_file(item_list_file)
        self.user_table = index_table_from_file(user_list_file)
        sess.run(tf.tables_initializer())

        self.generate_sparse_tensor_table(sess)
Example #2
def create_vocab_tables(src_vocab_file, tgt_vocab_file):
    src_vocab_table = index_table_from_file(
        src_vocab_file, default_value=UNK_ID
    )

    tgt_vocab_table = index_table_from_file(
        tgt_vocab_file, default_value=UNK_ID
    )

    return src_vocab_table, tgt_vocab_table
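The tables returned here are graph resources; in TF 1.x they must be initialized before the first lookup runs. A minimal sketch, assuming a plain-text vocabulary file with one token per line so that the line number becomes the id:

import tensorflow as tf  # TF 1.x
from tensorflow.contrib import lookup

UNK_ID = 0  # assumption: id reserved for unknown tokens

table = lookup.index_table_from_file("vocab.txt", default_value=UNK_ID)
ids = table.lookup(tf.constant(["the", "quick", "some-unseen-token"]))

with tf.Session() as sess:
    sess.run(tf.tables_initializer())  # required before the first lookup
    print(sess.run(ids))               # unknown tokens come back as UNK_ID

Inside tf.estimator model functions (as in several later examples) the default training scaffold runs this initializer for you, which is why no explicit tables_initializer call appears there.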
Example #3
 def __init__(self,
              tensor_key,
              tags_id_lookup_file,
              num_classes,
              dtype=tf.float32,
              shape_keys=None,
              shape=None,
              default_value='',
              tags_key_column_index=None,
              tags_value_column_index=None,
              delimiter=None):
     """Initializes the OneHotLabelTensor handler, which decodes label text into one-hot encodings."""
     lookup_kwargs = {}
     if tags_key_column_index is not None:
         lookup_kwargs['key_column_index'] = tags_key_column_index
     if tags_value_column_index is not None:
         lookup_kwargs['value_column_index'] = tags_value_column_index
     table = lookup.index_table_from_file(tags_id_lookup_file,
                                          **lookup_kwargs)
     self._table = table
     self._delimiter = delimiter or TAG_TEXT_DELIMITER
     self._num_classes = num_classes
     self._dtype = dtype
     super(OneHotLabelTensor, self).__init__(tensor_key, shape_keys, shape,
                                             default_value)
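The key_column_index / value_column_index arguments forwarded above let the table read both the tag text and its integer id from columns of a delimited vocabulary file (tab-separated by default), instead of deriving the id from the line number. A small sketch under that assumption; the tags.tsv name and its two-column layout are illustrative, not from the original handler:

from tensorflow.contrib import lookup

# tags.tsv (hypothetical): one "tag<TAB>id" pair per line
table = lookup.index_table_from_file('tags.tsv',
                                     key_column_index=0,    # column holding the tag text
                                     value_column_index=1)  # column holding its integer id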
Example #4
 def _apply_vocab(y, deferred_vocab_filename_tensor):
   table = lookup.index_table_from_file(
       deferred_vocab_filename_tensor,
       num_oov_buckets=num_oov_buckets,
       default_value=default_value)
   table_size = table.size()
   return table.lookup(y), table_size
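index_table_from_file supports two out-of-vocabulary strategies: hash unknown strings into num_oov_buckets extra ids appended after the vocabulary, or send every miss to a single default_value. When both are passed, as the helper above allows, unknown keys end up in the hash buckets rather than at default_value (at least in the TF 1.x implementation), so callers normally rely on just one of the two. A small sketch of the two modes, with an illustrative file name:

from tensorflow.contrib import lookup

# (a) unknown words hashed into 3 extra ids: vocab_size .. vocab_size + 2
hashed = lookup.index_table_from_file('vocab.txt', num_oov_buckets=3)

# (b) every unknown word collapsed onto one id (-1 here)
collapsed = lookup.index_table_from_file('vocab.txt', default_value=-1)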
Example #5
 def __init__(self, config):
     self.config = config
     self.vocabulary = lookup.index_table_from_file(self.config.vocabulary_file, num_oov_buckets=0, default_value=0)
     self.pad_id = self.vocabulary.lookup(tf.constant(self.config.pad_sign))
     self.sentence_data = self.load_dataset_from_text(self.config.sentence_data_file_path)
     self.label_data = self.load_dataset_from_text(self.config.label_data_file_path)
     self.input_fn(self.sentence_data, self.label_data)
Example #6
def linear_model(features, target, mode):
  # make input features numeric
  from tensorflow.contrib import lookup
  table = lookup.index_table_from_file(
        vocabulary_file=WORD_VOCAB_FILE, num_oov_buckets=1, vocab_size=N_WORDS, default_value=-1, name="word_to_index")
  titles = tf.squeeze(features['title'], [1])
  words = tf.string_split(titles)
  words = tf.sparse_tensor_to_dense(words, default_value='ZYXW')
  words = table.lookup(words)
  print('lookup_words={}'.format(words))

  # each row has variable length of words
  # take the first MAX_DOCUMENT_LENGTH words (pad shorter titles to this)
  padding = tf.constant([[0, 0], [0, MAX_DOCUMENT_LENGTH]])
  words = tf.pad(words, padding)
  words = tf.slice(words, [0,0], [-1,MAX_DOCUMENT_LENGTH])
  print('words_sliced={}'.format(words))  # (?, 20)

  # embed the words in a common way
  words = tf.contrib.layers.embed_sequence(
      words, vocab_size=(N_WORDS+1), embed_dim=EMBEDDING_SIZE, scope='words')
  print('words_embed={}'.format(words)) # (?, 20, 10)

  # now do convolution
  conv = tf.contrib.layers.convolution2d(
           words, 5, [3, EMBEDDING_SIZE] , padding='VALID')
  conv = tf.nn.relu(conv)
  words = tf.nn.max_pool(conv,
        ksize=[1, POOLING_WINDOW, 1, 1],
        strides=[1, POOLING_STRIDE, 1, 1],
        padding='SAME')
  print('words_conv={}'.format(words)) # 

  n_classes = len(TARGETS)

  logits = tf.contrib.layers.fully_connected(words, n_classes, activation_fn=None)
  print('logits={}'.format(logits))
  logits = tf.squeeze(logits, squeeze_dims=[1]) # from (?,1,3) to (?,3)
  predictions_dict = {
      'source': tf.gather(TARGETS, tf.argmax(logits, 1)),
      'class': tf.argmax(logits, 1),
      'prob': tf.nn.softmax(logits)
  }

  if mode == tf.contrib.learn.ModeKeys.TRAIN or mode == tf.contrib.learn.ModeKeys.EVAL:
     loss = tf.losses.sparse_softmax_cross_entropy(target, logits)
     train_op = tf.contrib.layers.optimize_loss(
       loss,
       tf.contrib.framework.get_global_step(),
       optimizer='Adam',
       learning_rate=0.01)
  else:
     loss = None
     train_op = None

  return tflearn.ModelFnOps(
      mode=mode,
      predictions=predictions_dict,
      loss=loss,
      train_op=train_op)
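This example, and the cnn_model variants later in the list, all use the same pad-then-slice idiom to turn ragged rows of token ids into a fixed MAX_DOCUMENT_LENGTH width: densify the SparseTensor from tf.string_split, pad MAX_DOCUMENT_LENGTH zeros on the right, then keep only the first MAX_DOCUMENT_LENGTH columns. A standalone sketch of just that idiom, with a tiny in-memory vocabulary standing in for the file-based table:

import tensorflow as tf
from tensorflow.contrib import lookup

MAX_DOCUMENT_LENGTH = 5

titles = tf.constant(['some short title',
                      'a considerably longer example title right here'])
table = lookup.index_table_from_tensor(
    mapping=tf.constant(['some', 'short', 'title', 'a', 'longer']),
    default_value=0)  # unknown words (and the pad marker) map to id 0

words = tf.string_split(titles)                         # SparseTensor of tokens
dense = tf.sparse_tensor_to_dense(words, default_value='PAD')
ids = table.lookup(dense)                               # [batch, longest_row]
padded = tf.pad(ids, [[0, 0], [0, MAX_DOCUMENT_LENGTH]])
fixed = tf.slice(padded, [0, 0], [-1, MAX_DOCUMENT_LENGTH])  # [batch, 5]

with tf.Session() as sess:
    sess.run(tf.tables_initializer())
    print(sess.run(fixed))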
Example #7
    def __init__(self,
                 batch_size,
                 vocabularies,
                 embedding_dim,
                 num_oov_buckets=1000,
                 fine_tune_embeddings=False,
                 padded_token=None,
                 name='embed_and_pad'):
        super(EmbedAndPad, self).__init__(name=name)
        self._batch_size = batch_size
        vocab_file, vocab_size = get_merged_vocabulary_file(
            vocabularies, padded_token)
        self._vocab_size = vocab_size
        self._num_oov_buckets = num_oov_buckets

        # Load vocabulary table for index lookup.
        self._vocabulary_table = contrib_lookup.index_table_from_file(
            vocabulary_file=vocab_file,
            num_oov_buckets=num_oov_buckets,
            vocab_size=self._vocab_size)

        def create_initializer(initializer_range=0.02):
            """Creates a `truncated_normal_initializer` with the given range."""
            # The default value is chosen from language/bert/modeling.py.
            return tf.truncated_normal_initializer(stddev=initializer_range)

        self._embeddings = tf.get_variable(
            'embeddings_matrix',
            [self._vocab_size + num_oov_buckets, embedding_dim],
            trainable=fine_tune_embeddings,
            initializer=create_initializer())
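A detail worth noting here: the embedding matrix has vocab_size + num_oov_buckets rows, because out-of-vocabulary tokens are hashed into ids in the range [vocab_size, vocab_size + num_oov_buckets). A compact sketch of that pairing; embed_tokens and its arguments are illustrative, not part of the original class:

import tensorflow as tf
from tensorflow.contrib import lookup as contrib_lookup

def embed_tokens(tokens, vocab_file, vocab_size, embedding_dim, num_oov_buckets=1000):
    """Maps a string tensor of tokens to embedding vectors, OOV buckets included."""
    table = contrib_lookup.index_table_from_file(
        vocabulary_file=vocab_file,
        num_oov_buckets=num_oov_buckets,
        vocab_size=vocab_size)
    embeddings = tf.get_variable(
        'embeddings_matrix',
        [vocab_size + num_oov_buckets, embedding_dim])  # one row per possible id
    return tf.nn.embedding_lookup(embeddings, table.lookup(tokens))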
Example #8
def get_lookup_table(element_file_path, oov_buckets, size=None, device='/cpu:0', name='lookup_table'):
    with tf.device(device):
        return lookup.index_table_from_file(vocabulary_file=element_file_path,
                                                      num_oov_buckets=oov_buckets,
                                                      vocab_size=size,
                                                      default_value=-1,  # -1 is always the padding value
                                                      name=name)
Example #9
def custom_fast_text(features, labels, mode, params):
    vocab_table = lookup.index_table_from_file(vocabulary_file='data/vocab.csv', num_oov_buckets=1, default_value=-1)
    text = features[commons.FEATURE_COL]
    words = tf.string_split(text)
    dense_words = tf.sparse_tensor_to_dense(words, default_value=commons.PAD_WORD)
    word_ids = vocab_table.lookup(dense_words)

    padding = tf.constant([[0, 0], [0, commons.CNN_MAX_DOCUMENT_LENGTH]])
    # Pad all the word_ids entries to the maximum document length
    word_ids_padded = tf.pad(word_ids, padding)
    word_id_vector = tf.slice(word_ids_padded, [0, 0], [-1, commons.CNN_MAX_DOCUMENT_LENGTH])

    if mode == tf.estimator.ModeKeys.TRAIN:
        tf.keras.backend.set_learning_phase(True)
    else:
        tf.keras.backend.set_learning_phase(False)

    embedded_sequences = tf.keras.layers.Embedding(params.N_WORDS, 20, input_length=commons.CNN_MAX_DOCUMENT_LENGTH)(
        word_id_vector)
    f1 = tf.keras.layers.GlobalMaxPooling1D()(embedded_sequences)
    logits = tf.keras.layers.Dense(commons.TARGET_SIZE, activation=None)(f1)

    predictions = tf.nn.sigmoid(logits)

    if mode == tf.estimator.ModeKeys.PREDICT:
        prediction_dict = {
            'class': tf.cast(tf.map_fn(lambda x: tf.cond(x > 0.30, lambda: 1.0, lambda: 0.0),
                                       tf.squeeze(predictions)), dtype=tf.int32),


        }

        export_outputs = {
            'predictions': tf.estimator.export.PredictOutput(prediction_dict)
        }

        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions, export_outputs=export_outputs)

    loss = tf.losses.sigmoid_cross_entropy(multi_class_labels=labels, logits=logits)

    tf.summary.scalar('loss', loss)

    acc = tf.equal(tf.cast(predictions, dtype=tf.int32), labels)
    acc = tf.reduce_mean(tf.cast(acc, tf.float32))

    tf.summary.scalar('acc', acc)

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdamOptimizer()

        train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())

        return tf.estimator.EstimatorSpec(mode=mode, train_op=train_op, loss=loss)

    if mode == tf.estimator.ModeKeys.EVAL:
        eval_metrics_ops = {
            'accuracy': tf.metrics.accuracy(labels=labels, predictions=predictions)
        }
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metrics_ops)
Example #10
def cnn_model(features, target, mode):
    table = lookup.index_table_from_file(vocabulary_file=WORD_VOCAB_FILE,
                                         num_oov_buckets=1,
                                         default_value=-1)

    # string operations
    titles = tf.squeeze(features['title'], [1])
    words = tf.string_split(titles)
    densewords = tf.sparse_tensor_to_dense(words, default_value=PADWORD)
    numbers = table.lookup(densewords)
    padding = tf.constant([[0, 0], [0, MAX_DOCUMENT_LENGTH]])
    padded = tf.pad(numbers, padding)
    sliced = tf.slice(padded, [0, 0], [-1, MAX_DOCUMENT_LENGTH])
    print('words_sliced={}'.format(sliced))  # (?, 20)

    # layer to take the words and convert them into vectors (embeddings)
    embeds = tf.contrib.layers.embed_sequence(sliced,
                                              vocab_size=N_WORDS,
                                              embed_dim=EMBEDDING_SIZE)
    print('words_embed={}'.format(embeds))  # (?, 20, 10)

    # now do convolution
    conv = tf.contrib.layers.conv2d(embeds,
                                    1,
                                    WINDOW_SIZE,
                                    stride=STRIDE,
                                    padding='SAME')  # (?, 4, 1)
    conv = tf.nn.relu(conv)  # (?, 4, 1)
    words = tf.squeeze(conv, [2])  # (?, 4)
    print('words_conv={}'.format(words))  # (?, 4)

    n_classes = len(TARGETS)

    logits = tf.contrib.layers.fully_connected(words,
                                               n_classes,
                                               activation_fn=None)
    #print('logits={}'.format(logits)) # (?, 3)
    predictions_dict = {
        'source': tf.gather(TARGETS, tf.argmax(logits, 1)),
        'class': tf.argmax(logits, 1),
        'prob': tf.nn.softmax(logits)
    }

    if mode == tf.contrib.learn.ModeKeys.TRAIN or mode == tf.contrib.learn.ModeKeys.EVAL:
        loss = tf.losses.sparse_softmax_cross_entropy(target, logits)
        train_op = tf.contrib.layers.optimize_loss(
            loss,
            tf.contrib.framework.get_global_step(),
            optimizer='Adam',
            learning_rate=0.01)
    else:
        loss = None
        train_op = None

    return tflearn.ModelFnOps(mode=mode,
                              predictions=predictions_dict,
                              loss=loss,
                              train_op=train_op)
Example #11
    def _build(self):
        words, nwords = self.inp  # words: the input texts; nwords: the lengths of the input texts
        vocab_words = index_table_from_file(
            self.params.vocab,
            num_oov_buckets=self.params.num_oov_buckets)  # build the word-to-id mapping from the vocabulary file
        # Convert words to ids; every out-of-vocabulary word gets the current maximum id plus one,
        # e.g. if the largest id in the vocabulary is 10, all OOV words map to id 11.
        word_ids = vocab_words.lookup(words)

        with tf.variable_scope("embedding"):
            if self.params.use_pretrained:
                glove = np.load(self.params.embed)
                # Append an all-zero row to the bottom of the GloVe matrix as the vector for OOV words
                glove = np.vstack([glove, [[0.] * self.params.embed_dim]])
                W = tf.Variable(glove, dtype=tf.float32, trainable=True)
            else:
                W = tf.Variable(tf.random_uniform(
                    [self.params.vocab_size, self.params.embed_dim], -1.0,
                    1.0),
                                name='W',
                                trainable=True)
            embeddings = tf.nn.embedding_lookup(W, word_ids)
            # (batch_size,seq_len,embedding_dim)
            embeddings = tf.layers.dropout(embeddings,
                                           rate=self.params.dropout,
                                           training=self.training)

        outputs = []  # collect the outputs of the stacked BiLSTM layers
        with tf.variable_scope("BiLSTM"):
            outputs.append(self.BiLSTM(embeddings, nwords))
            for i in range(self.params.lstm_layer - 1):
                outputs.append(self.BiLSTM(outputs[-1], nwords))

        with tf.variable_scope("CRF"):
            self.logits, self.pred_ids, crf_params, self.score = self.CRF(
                outputs[-1], self.num_tags, nwords)

        with tf.variable_scope("output"):
            self.probs = tf.nn.softmax(self.logits, axis=-1)
            best_probs = tf.reduce_max(self.probs, axis=-1)
            self.mnlp_score = tf.reduce_mean(tf.log(best_probs), axis=-1)
            reverse_vocab_tags = tf.contrib.lookup.index_to_string_table_from_tensor(
                self.params.tags)  # reverse table: id -> tag string
            self.pred_strings = reverse_vocab_tags.lookup(
                tf.to_int64(self.pred_ids))  # convert the predicted ids back to their tags
            self.weights = tf.sequence_mask(nwords)

        with tf.variable_scope("loss"):
            if self.mode != tf.estimator.ModeKeys.PREDICT:
                vocab_tags = tf.contrib.lookup.index_table_from_tensor(
                    self.params.tags)  # lookup table for the tags
                self.tags = vocab_tags.lookup(self.labels)  # convert tag strings to their ids
                log_likelihood, _ = tf.contrib.crf.crf_log_likelihood(
                    self.logits, self.tags, nwords, crf_params)
                self.loss = tf.reduce_mean(-log_likelihood)
                self.train_op = tf.train.AdamOptimizer().minimize(
                    self.loss,
                    global_step=tf.train.get_or_create_global_step())
Example #12
def model_fn(features, labels, mode, params):
    if mode == tf.estimator.ModeKeys.TRAIN:
        tf.keras.backend.set_learning_phase(True)
    else:
        tf.keras.backend.set_learning_phase(False)
    text = features[commons.FEATURE_COL]
    words = tf.string_split(text)
    x = words.values
    split_chars = tf.string_split(x, delimiter='')
    table = lookup.index_table_from_file(vocabulary_file='data/vocab.csv',
                                         vocab_size=69,
                                         default_value=0)

    dense_words = tf.sparse_tensor_to_dense(split_chars, default_value='#')
    word_ids = table.lookup(dense_words)

    padding = tf.constant([[0, 0], [0, commons.MAX_DOCUMENT_LENGTH]])
    # Pad all the word_ids entries to the maximum document length
    word_ids_padded = tf.pad(word_ids, padding)
    word_id_vector = tf.slice(word_ids_padded, [0, 0],
                              [-1, commons.MAX_DOCUMENT_LENGTH])

    encoded = tf.one_hot(table.lookup(split_chars.values),
                         commons.MAX_DOCUMENT_LENGTH,
                         dtype=tf.float32)
    encoded = tf.reshape(encoded, [commons.MAX_DOCUMENT_LENGTH, 69])
    f1 = tf.keras.layers.Convolution1D(filters=256,
                                       kernel_size=7,
                                       padding="valid",
                                       activation='relu')(word_id_vector)
    f1 = tf.keras.layers.MaxPooling1D(pool_size=3)(f1)
    f1 = tf.keras.layers.Flatten()(f1)
    #f1 = tf.keras.layers.Flatten()(f1)
    logits = tf.keras.layers.Dense(commons.TARGET_SIZE, activation=None)(f1)

    predictions = tf.nn.softmax(logits)
    prediction_indices = tf.argmax(predictions, axis=1)

    labels_one_hot = tf.one_hot(labels, depth=4, dtype=tf.int32)

    #loss = tf.losses.softmax_cross_entropy(onehot_labels=labels_one_hot, logits=logits)

    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)

    optimizer = tf.train.AdamOptimizer()
    train_op = optimizer.minimize(loss=loss,
                                  global_step=tf.train.get_global_step())
    print(tf.shape(labels))
    print(tf.shape(prediction_indices))
    eval_metrics_ops = {
        'accuracy':
        tf.metrics.accuracy(labels=labels, predictions=prediction_indices)
    }
    return tf.estimator.EstimatorSpec(mode=mode,
                                      loss=loss,
                                      train_op=train_op,
                                      eval_metric_ops=eval_metrics_ops)
Example #13
def get_wide_deep():

    MAX_DOCUMENT_LENGTH = 20
    PADWORD = 'xyzpadxyz'
    EMBEDDING_SIZE = 10
    VOCAB_SIZE = 10002

    # Define column types
    subreddit = tf.feature_column.categorical_column_with_vocabulary_list(
        'subreddit', ['news', 'ireland', 'pics'])

    vocab_table = lookup.index_table_from_file(
        vocabulary_file='{}/vocab.csv-00000-of-00001'.format(INPUT_DIR),
        num_oov_buckets=1,
        vocab_size=None,
        default_value=-1)

    # i have tried all these and none work
    #comment_words = tf.string_split(tf.get_variable('comment'))
    #comment_words = tf.string_split(['comment'])
    #comment_words = tf.string_split(tf.constant(['comment']))
    #comment_words = tf.string_split([comment])
    #comment_words = tf.string_split('comment')
    #comment = tf.constant(['comment'])
    #comment = tf.constant(dataset['comment'])
    #comment = tf.constant(comment)
    #comment_words = tf.string_split(features["comment"])
    #comment_words = tf.string_split(dataset["comment"])
    #comment = tf.constant(features.get('comment'))
    #comment = tf.constant(features.get('comment'))

    #comment_words = tf.string_split(features.get("comment"))
    #comment_words = tf.string_split(dataset.get("comment"))

    #comment_densewords = tf.sparse_tensor_to_dense(comment_words, default_value=PADWORD)
    #comment_numbers = vocab_table.lookup(comment_densewords)
    #comment_padding = tf.constant([[0,0],[0,MAX_DOCUMENT_LENGTH]])
    #comment_padded = tf.pad(comment_numbers, comment_padding)
    #comment_sliced = tf.slice(comment_padded, [0,0], [-1, MAX_DOCUMENT_LENGTH])

    #print('comment_sliced={}'.format(comment_words))  # (?, 20)
    #comment_integerized = tf.contrib.layers.sparse_column_with_integerized_feature(comment_sliced, bucket_size=VOCAB_SIZE, combiner='sum')

    #comment_bow = tf.one_hot(comment_sliced)

    #comment_embeds = tf.contrib.layers.embedding_column(comment_integerized, dimension=EMBEDDING_SIZE)
    #print('comment_embeds={}'.format(comment_embeds)) # (?, 20, 10)

    # Sparse columns are wide, have a linear relationship with the output
    wide = [subreddit]

    # Continuous columns are deep, have a complex relationship with the output
    deep = []

    return wide, deep
Example #14
    def input_fn(params):
        """The actual input function."""
        batch_size = params["batch_size"]

        vocab_table = contrib_lookup.index_table_from_file(FLAGS.vocab_file)

        if len(expanded_files) == 1:
            d = tf.data.TFRecordDataset(expanded_files[0])
            if is_training:
                d = d.repeat()
                d = d.shuffle(buffer_size=256)
        else:
            dataset_list = [
                tf.data.TFRecordDataset(expanded_files[i])
                for i in range(len(expanded_files))
            ]
            if is_training:
                dataset_list = [d.repeat() for d in dataset_list]
            dset_weights = [
                FLAGS.dataset_one_weight, 1 - FLAGS.dataset_one_weight
            ]
            if FLAGS.dataset_two_weight != 0:
                dset_weights = [
                    FLAGS.dataset_one_weight, FLAGS.dataset_two_weight,
                    1 - FLAGS.dataset_one_weight + FLAGS.dataset_two_weight
                ]
            d = tf.data.experimental.sample_from_datasets(
                dataset_list, dset_weights)

            # Note that sample_from_datasets() inserts randomness into the training.
            # An alternative would be to use choose_from_datasets(), but then the
            # order must be stated explicitly, which is less intuitive for unbalanced
            # datasets. Example below:
            #
            # choice_dataset = tf.data.Dataset.range(len(dataset_list)).repeat()
            # d = tf.data.experimental.choose_from_datasets(dataset_list,
            #                                               choice_dataset)

            if is_training:
                d = d.shuffle(buffer_size=256)

        # The window size is used for selecting negative samples;
        # it equals the number of documents to sample from, minus 1.
        d = d.apply(
            contrib_data.sliding_window_batch(
                window_size=FLAGS.data_window_size,
                window_shift=FLAGS.data_window_shift))
        d = d.apply(
            tf.data.experimental.map_and_batch(lambda record: _decode_record(
                record, name_to_features, vocab_table),
                                               batch_size=batch_size,
                                               drop_remainder=drop_remainder))

        return d
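Here the vocabulary table is captured by the _decode_record closure and applied inside the tf.data pipeline, so string-to-id conversion happens per record instead of in the model function. The decoder itself is not shown in this snippet; a minimal sketch of that shape, with a hypothetical single "tokens" feature standing in for the real name_to_features schema:

import tensorflow as tf
from tensorflow.contrib import lookup as contrib_lookup

def make_dataset(file_pattern, vocab_file, batch_size):
    vocab_table = contrib_lookup.index_table_from_file(vocab_file, num_oov_buckets=1)

    def _parse(record):
        # Hypothetical schema: one variable-length string feature called "tokens".
        parsed = tf.parse_single_example(
            record, {'tokens': tf.VarLenFeature(tf.string)})
        tokens = tf.sparse_tensor_to_dense(parsed['tokens'], default_value='')
        return vocab_table.lookup(tokens)  # string tokens -> int64 ids

    d = tf.data.TFRecordDataset(tf.gfile.Glob(file_pattern))
    return d.map(_parse).padded_batch(batch_size, padded_shapes=[None])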
Example #15
def load_vocab(infilename):
    v = arguments.pop('vocab', None)
    if v is None:
        return
    print("Loading Vocabulary {0}".format(v))
    table = lookup.index_table_from_file(vocabulary_file=infilename,
                                         num_oov_buckets=1,
                                         vocab_size=None,
                                         default_value=-1)
    numbers = table.lookup(tf.constant('quick fox the not blah blah'.split()))
    with tf.Session() as sess:
        tf.tables_initializer().run()
        print("{} --> {}".format(LINES[0], numbers.eval()))
Example #16
    def test_stale_asset_collections_are_cleaned(self):
        vocabulary_file = os.path.join(compat.as_bytes(test.get_temp_dir()),
                                       compat.as_bytes('asset'))
        file_io.write_string_to_file(vocabulary_file, 'foo bar baz')

        export_path = os.path.join(tempfile.mkdtemp(), 'export')

        # create a SavedModel including assets
        with tf.Graph().as_default():
            with tf.Session().as_default() as session:
                input_string = tf.placeholder(tf.string)
                # Map string through a table loaded from an asset file
                table = lookup.index_table_from_file(vocabulary_file,
                                                     num_oov_buckets=12,
                                                     default_value=12)
                output = table.lookup(input_string)
                inputs = {'input': input_string}
                outputs = {'output': output}
                saved_transform_io.write_saved_transform_from_session(
                    session, inputs, outputs, export_path)

        # Load it and save it again repeatedly, verifying that the asset collections
        # remain valid.
        for _ in [1, 2, 3]:
            with tf.Graph().as_default() as g:
                with tf.Session().as_default() as session:
                    input_string = tf.constant('dog')
                    inputs = {'input': input_string}
                    _, outputs = (saved_transform_io.
                                  partially_apply_saved_transform_internal(
                                      export_path, inputs))

                    self.assertEqual(
                        1,
                        len(g.get_collection(ops.GraphKeys.ASSET_FILEPATHS)))
                    self.assertEqual(
                        0,
                        len(
                            g.get_collection(
                                tf.saved_model.constants.ASSETS_KEY)))

                    # Check that every ASSET_FILEPATHS refers to a Tensor in the graph.
                    # If not, get_tensor_by_name() raises KeyError.
                    for asset_path in g.get_collection(
                            ops.GraphKeys.ASSET_FILEPATHS):
                        tensor_name = asset_path.name
                        g.get_tensor_by_name(tensor_name)

                    export_path = os.path.join(tempfile.mkdtemp(), 'export')
                    saved_transform_io.write_saved_transform_from_session(
                        session, inputs, outputs, export_path)
Example #17
def cnn_model(features, target, mode):
    table = lookup.index_table_from_file(vocabulary_file=WORD_VOCAB_FILE, num_oov_buckets=1, default_value=-1)
    
    # string operations
    titles = tf.squeeze(features['title'], [1])
    words = tf.string_split(titles)
    densewords = tf.sparse_tensor_to_dense(words, default_value=PADWORD)
    numbers = table.lookup(densewords)
    padding = tf.constant([[0,0],[0,MAX_DOCUMENT_LENGTH]])
    padded = tf.pad(numbers, padding)
    sliced = tf.slice(padded, [0,0], [-1, MAX_DOCUMENT_LENGTH])
    print('words_sliced={}'.format(sliced))  # (?, 20)

    # layer to take the words and convert them into vectors (embeddings)
    embeds = tf.contrib.layers.embed_sequence(sliced, vocab_size=N_WORDS, embed_dim=EMBEDDING_SIZE)
    print('words_embed={}'.format(embeds)) # (?, 20, 10)
    
    # now do convolution
    conv = tf.contrib.layers.conv2d(embeds, 1, WINDOW_SIZE, stride=STRIDE, padding='SAME') # (?, 4, 1)
    conv = tf.nn.relu(conv) # (?, 4, 1)
    words = tf.squeeze(conv, [2]) # (?, 4)
    print('words_conv={}'.format(words)) # (?, 4)

    n_classes = len(TARGETS)

    logits = tf.contrib.layers.fully_connected(words, n_classes, activation_fn=None)
    #print('logits={}'.format(logits)) # (?, 3)
    predictions_dict = {
      'source': tf.gather(TARGETS, tf.argmax(logits, 1)),
      'class': tf.argmax(logits, 1),
      'prob': tf.nn.softmax(logits)
    }

    if mode == tf.contrib.learn.ModeKeys.TRAIN or mode == tf.contrib.learn.ModeKeys.EVAL:
       loss = tf.losses.sparse_softmax_cross_entropy(target, logits)
       train_op = tf.contrib.layers.optimize_loss(
         loss,
         tf.contrib.framework.get_global_step(),
         optimizer='Adam',
         learning_rate=0.01)
    else:
       loss = None
       train_op = None

    return tflearn.ModelFnOps(
      mode=mode,
      predictions=predictions_dict,
      loss=loss,
      train_op=train_op)
Example #18
def train_input_fn():
    def file_len(fname):
        with open(fname) as f:
            for i, l in enumerate(f):
                pass
        return i + 1

    filename = "train.csv"
    filename_queue = tf.train.string_input_producer([filename])
    reader = tf.TextLineReader()
    _, value = reader.read(filename_queue)
    record_defaults = DEFAULTS
    col1, col2 = tf.decode_csv(value,
                               record_defaults=record_defaults,
                               field_delim='|')
    label = tf.stack([col1])
    features = tf.stack([col2])

    table = tf.contrib.lookup.index_table_from_tensor(
        mapping=tf.constant(TARGETS), num_oov_buckets=0, default_value=-1)
    labels = table.lookup(label)

    table2 = lookup.index_table_from_file(vocabulary_file='vocab.tsv',
                                          num_oov_buckets=1,
                                          vocab_size=None,
                                          default_value=-1)

    #look strings up in the vocabulary
    words = tf.string_split(features)
    densewords = tf.sparse_tensor_to_dense(words, default_value=PADWORD)
    numbers = table2.lookup(densewords)

    #pads vectors out to MAX_DOCUMENT_LENGTH
    padding = tf.constant([[0, 0], [0, MAX_DOCUMENT_LENGTH]])
    padded = tf.pad(numbers, padding)
    sliced = tf.slice(padded, [0, 0], [-1, MAX_DOCUMENT_LENGTH])
    shaped = tf.reshape(sliced, [1735])

    batch_size = file_len(filename)
    min_after_dequeue = 10000
    capacity = min_after_dequeue + 3 * batch_size

    features, labels = tf.train.shuffle_batch(
        [shaped, labels],
        batch_size=batch_size,
        capacity=capacity,
        min_after_dequeue=min_after_dequeue)

    return features, labels
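This input_fn is written against the legacy queue-runner pipeline (string_input_producer, TextLineReader, shuffle_batch), which additionally needs queue runners started outside the function. The same CSV parsing and lookups can be expressed with tf.data; a rough sketch, reusing this snippet's own DEFAULTS, TARGETS, PADWORD and MAX_DOCUMENT_LENGTH definitions:

def train_input_fn_tf_data(filename='train.csv', batch_size=32):
    target_table = tf.contrib.lookup.index_table_from_tensor(
        mapping=tf.constant(TARGETS), num_oov_buckets=0, default_value=-1)
    vocab_table = lookup.index_table_from_file(
        vocabulary_file='vocab.tsv', num_oov_buckets=1, default_value=-1)

    def _parse(line):
        col1, col2 = tf.decode_csv(line, record_defaults=DEFAULTS, field_delim='|')
        label = target_table.lookup(tf.stack([col1]))[0]
        words = tf.string_split(tf.stack([col2]))
        dense = tf.sparse_tensor_to_dense(words, default_value=PADWORD)
        ids = vocab_table.lookup(dense)
        padded = tf.pad(ids, tf.constant([[0, 0], [0, MAX_DOCUMENT_LENGTH]]))
        return tf.slice(padded, [0, 0], [-1, MAX_DOCUMENT_LENGTH])[0], label

    return (tf.data.TextLineDataset(filename)
            .map(_parse)
            .shuffle(10000)
            .batch(batch_size))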
Example #19
def get_embedding(hparams, titles, embed_size):
    table = lookup.index_table_from_file(vocabulary_file=WORD_VOCAB_FILE, num_oov_buckets=1, default_value=-1)
    
    # string operations
    words = tf.string_split(titles)
    densewords = tf.sparse_tensor_to_dense(words, default_value=PADWORD)
    numbers = table.lookup(densewords)
    padding = tf.constant([[0,0],[0,MAX_DOCUMENT_LENGTH]])
    padded = tf.pad(numbers, padding)
    sliced = tf.slice(padded, [0,0], [-1, MAX_DOCUMENT_LENGTH])
    #print('words_sliced={}'.format(words))  # (?, 20)

    # layer to take the words and convert them into vectors (embeddings)
    embeds = tf.contrib.layers.embed_sequence(sliced, vocab_size=N_WORDS, embed_dim=embed_size)
    #print('words_embed={}'.format(embeds)) # (?, 20, 10)
    return embeds
Example #20
def model_fn(features, labels, mode, params):
    if mode == tf.estimator.ModeKeys.TRAIN:
        tf.keras.backend.set_learning_phase(True)
    else:
        tf.keras.backend.set_learning_phase(False)

    vocab_table = lookup.index_table_from_file(vocabulary_file='data/vocab.csv', num_oov_buckets=1, default_value=-1)
    text = features[commons.FEATURE_COL]
    words = tf.string_split(text)
    dense_words = tf.sparse_tensor_to_dense(words, default_value=commons.PAD_WORD)
    word_ids = vocab_table.lookup(dense_words)

    padding = tf.constant([[0, 0], [0, commons.MAX_DOCUMENT_LENGTH]])
    # Pad all the word_ids entries to the maximum document length
    word_ids_padded = tf.pad(word_ids, padding)
    word_id_vector = tf.slice(word_ids_padded, [0, 0], [-1, commons.MAX_DOCUMENT_LENGTH])

    f1 = tf.keras.layers.Embedding(params.N_WORDS, 100, input_length=commons.MAX_DOCUMENT_LENGTH)(word_id_vector)
    f2 = tf.keras.layers.Embedding(params.N_WORDS, 200, input_length=commons.MAX_DOCUMENT_LENGTH)(word_id_vector)
    f3 = tf.keras.layers.Embedding(params.N_WORDS, 300, input_length=commons.MAX_DOCUMENT_LENGTH)(word_id_vector)

    filter_sizes = [3, 5]

    conv_pools = []
    for text_embedding in [f1, f2, f3]:
        for filter_size in filter_sizes:
            l_zero = tf.keras.layers.ZeroPadding1D((filter_size - 1, filter_size - 1))(text_embedding)
            l_conv = tf.keras.layers.Conv1D(filters=32, kernel_size=filter_size, padding='same', activation='tanh')(l_zero)
            l_pool = tf.keras.layers.GlobalMaxPool1D()(l_conv)
            conv_pools.append(l_pool)
    merged = tf.keras.layers.Concatenate(axis=1)(conv_pools)
    dense1 = tf.keras.layers.Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(merged)
    dense2 = tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(dense1)

    logits = tf.keras.layers.Dense(1, activation=None)(dense2)

    if labels is not None:
        labels = tf.reshape(labels, [-1, 1])

    optimizer = tf.train.AdamOptimizer()

    def _train_op_fn(loss):
        return optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())

    return head.create_estimator_spec(features=features, labels=labels, mode=mode, logits=logits,
                                      train_op_fn=_train_op_fn)
Example #21
def serving_fn():
    input_string = tf.placeholder(dtype=tf.string, shape=None)

    receiver_tensor = {
        'sms_input': input_string
    }
    # word_id_vector = tf.map_fn(fn=map_serving, elems=input_string)
    vocab_table = lookup.index_table_from_file(vocabulary_file='vocab.csv', num_oov_buckets=1, default_value=-1)
    words = tf.string_split(input_string)
    dense_words = tf.sparse_tensor_to_dense(words, default_value=commons.PAD_WORD)
    word_ids = vocab_table.lookup(dense_words)

    padding = tf.constant([[0, 0], [0, max_words]])
    # Pad all the word_ids entries to the maximum document length
    word_ids_padded = tf.pad(word_ids, padding)
    word_id_vector = tf.slice(word_ids_padded, [0, 0], [-1, max_words])
    features = {'sms_input': word_id_vector}

    return tf.estimator.export.ServingInputReceiver(features, receiver_tensor)
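A serving input function like this is only used at export time; the resulting SavedModel then accepts raw strings and performs the vocabulary lookup inside the exported graph. A minimal usage sketch (the estimator construction and paths are placeholders, not part of the original snippet), keeping in mind that the 'sms_input' feature key must match what the model_fn expects:

# Assumes an estimator already built with the matching model_fn.
estimator = tf.estimator.Estimator(model_fn=model_fn, model_dir='model_dir')
estimator.export_savedmodel('export_dir', serving_input_receiver_fn=serving_fn)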
Example #22
def model_fn(features, labels, mode, params):
    if mode == tf.estimator.ModeKeys.TRAIN:
        tf.keras.backend.set_learning_phase(True)
    else:
        tf.keras.backend.set_learning_phase(False)

    vocab_table = lookup.index_table_from_file(vocabulary_file='data/vocab.csv', num_oov_buckets=1, default_value=-1)
    text = features[commons.FEATURE_COL]
    words = tf.string_split(text)
    dense_words = tf.sparse_tensor_to_dense(words, default_value=commons.PAD_WORD)
    word_ids = vocab_table.lookup(dense_words)

    padding = tf.constant([[0, 0], [0, commons.MAX_DOCUMENT_LENGTH]])
    # Pad all the word_ids entries to the maximum document length
    word_ids_padded = tf.pad(word_ids, padding)
    word_id_vector = tf.slice(word_ids_padded, [0, 0], [-1, commons.MAX_DOCUMENT_LENGTH])

    f1 = tf.keras.layers.Embedding(params.N_WORDS, 50, input_length=commons.MAX_DOCUMENT_LENGTH)(word_id_vector)
    f1 = tf.keras.layers.Dropout(0.2)(f1)
    f1 = tf.keras.layers.ZeroPadding1D((49, 49))(f1)
    f1 = tf.keras.layers.Conv1D(64, 50, padding='same', activation=None, strides=1)(f1)
    f1 = KMaxPooling(k=9, axis=1)(f1)

    f1 = tf.keras.layers.ZeroPadding1D((24, 24))(f1)
    f1 = tf.keras.layers.Conv1D(64, 25, padding='same', activation=None, strides=1)(f1)
    f1 = Folding()(f1)
    f1 = KMaxPooling(k=9, axis=1)(f1)
    f1 = tf.keras.layers.Activation('relu')(f1)
    f1 = tf.keras.layers.Flatten()(f1)
    f1 = tf.keras.layers.Dropout(0.2)(f1)
    logits = tf.keras.layers.Dense(1, activation=None)(f1)

    if labels is not None:
        labels = tf.reshape(labels, [-1, 1])

    optimizer = tf.train.AdamOptimizer()

    def _train_op_fn(loss):
        return optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())

    return head.create_estimator_spec(features=features, labels=labels, mode=mode, logits=logits,
                                      train_op_fn=_train_op_fn)
Example #23
    def input_fn(params):
        """The actual input function."""
        batch_size = params["batch_size"]

        # For training, we want a lot of parallel reading and shuffling.
        # For eval, we want no shuffling and parallel reading doesn't matter.
        vocab_table = contrib_lookup.index_table_from_file(FLAGS.vocab_file)

        d = tf.data.TFRecordDataset(input_file)
        if is_training:
            d = d.repeat()
            d = d.shuffle(buffer_size=256)

        d = d.apply(
            tf.data.experimental.map_and_batch(lambda record: _decode_record(
                record, name_to_features, vocab_table),
                                               batch_size=batch_size,
                                               drop_remainder=drop_remainder))

        return d
Example #24
def serving_fn():
    input_string = tf.placeholder(dtype=tf.string, shape=None)

    receiver_tensor = {'sms_input': input_string}
    # word_id_vector = tf.map_fn(fn=map_serving, elems=input_string)
    vocab_table = lookup.index_table_from_file(vocabulary_file='vocab.csv',
                                               num_oov_buckets=1,
                                               default_value=-1)
    words = tf.string_split(input_string)
    dense_words = tf.sparse_tensor_to_dense(words,
                                            default_value=commons.PAD_WORD)
    word_ids = vocab_table.lookup(dense_words)

    padding = tf.constant([[0, 0], [0, max_words]])
    # Pad all the word_ids entries to the maximum document length
    word_ids_padded = tf.pad(word_ids, padding)
    word_id_vector = tf.slice(word_ids_padded, [0, 0], [-1, max_words])
    features = {'sms_input': word_id_vector}

    return tf.estimator.export.ServingInputReceiver(features, receiver_tensor)
Example #25
def linear_model(features, target, mode):
    # make input features numeric
    from tensorflow.contrib import lookup
    table = lookup.index_table_from_file(vocabulary_file=WORD_VOCAB_FILE,
                                         num_oov_buckets=1,
                                         vocab_size=N_WORDS,
                                         default_value=-1,
                                         name="word_to_index")
    word_indexes = table.lookup(features['title'])
    word_vectors = tf.contrib.layers.embed_sequence(word_indexes,
                                                    vocab_size=(N_WORDS + 1),
                                                    embed_dim=EMBEDDING_SIZE,
                                                    scope='words')

    n_classes = len(TARGETS)

    logits = tf.contrib.layers.fully_connected(word_vectors,
                                               n_classes,
                                               activation_fn=None)
    logits = tf.squeeze(logits, squeeze_dims=[1])  # from (?,1,3) to (?,3)
    predictions_dict = {
        'source': tf.gather(TARGETS, tf.argmax(logits, 1)),
        'class': tf.argmax(logits, 1),
        'prob': tf.nn.softmax(logits)
    }

    if mode == tf.contrib.learn.ModeKeys.TRAIN or mode == tf.contrib.learn.ModeKeys.EVAL:
        loss = tf.losses.sparse_softmax_cross_entropy(target, logits)
        train_op = tf.contrib.layers.optimize_loss(
            loss,
            tf.contrib.framework.get_global_step(),
            optimizer='Adam',
            learning_rate=0.01)
    else:
        loss = None
        train_op = None

    return tflearn.ModelFnOps(mode=mode,
                              predictions=predictions_dict,
                              loss=loss,
                              train_op=train_op)
Example #26
def cnn_model(features, labels, mode):

    # convert vocab to numbers
    table = lookup.index_table_from_file(vocabulary_file='vocab.tsv',
                                         num_oov_buckets=1,
                                         vocab_size=None,
                                         default_value=-1)

    #Looks up specific terms 'Some title'
    # numbers = table.lookup(tf.constant('Some title'.split()))
    # with tf.Session() as sess:
    #   tf.tables_initializer().run()
    #   print("{} --> {}".format(lines[0], numbers.eval()))

    #create sparse vectors, convert to dense and look vectors up in the dictionary
    # titles = tf.squeeze(features['Review Text'], [1])
    words = tf.string_split(features)
    densewords = tf.sparse_tensor_to_dense(words, default_value=PADWORD)
    numbers = table.lookup(densewords)

    #Shows dense word vectors
    # sess = tf.Session()
    #sess.run(densewords)

    #Shows vectors of words where dictionary is applied
    #table.init.run(session=sess)
    #print(numbers.eval(session=sess))

    #pads vectors out to MAX_DOCUMENT_LENGTH
    padding = tf.constant([[0, 0], [0, MAX_DOCUMENT_LENGTH]])
    padded = tf.pad(numbers, padding)
    sliced = tf.slice(padded, [0, 0], [-1, MAX_DOCUMENT_LENGTH])
    # sess.run(sliced)

    #create embeddings

    embeds = tf.contrib.layers.embed_sequence(sliced,
                                              vocab_size=N_WORDS,
                                              embed_dim=EMBEDDING_SIZE)
    #print('words_embed={}'.format(embeds)) # (?, 20, 10)

    #Convolutions!!!

    conv = tf.contrib.layers.conv2d(embeds,
                                    1,
                                    WINDOW_SIZE,
                                    stride=STRIDE,
                                    padding='SAME')  # (?, 4, 1)
    conv = tf.nn.relu(conv)  # (?, 4, 1)
    words = tf.squeeze(conv, [2])  # (?, 4)

    logits = tf.contrib.layers.fully_connected(words,
                                               n_classes,
                                               activation_fn=None)

    correctPred = tf.equal(tf.argmax(logits, 1), labels)
    accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float64))

    tf.summary.scalar('Accuracy', accuracy)
    merged = tf.summary.merge_all()

    if mode == tf.contrib.learn.ModeKeys.TRAIN or mode == tf.contrib.learn.ModeKeys.EVAL:
        loss = tf.losses.sparse_softmax_cross_entropy(labels, logits)
        train_op = tf.contrib.layers.optimize_loss(
            loss,
            tf.contrib.framework.get_global_step(),
            optimizer='Adam',
            learning_rate=0.01)
    else:
        loss = None
        train_op = None

    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
Example #27
def model_fn(features, labels, mode, params):
    if mode == tf.estimator.ModeKeys.TRAIN:
        tf.keras.backend.set_learning_phase(True)
    else:
        tf.keras.backend.set_learning_phase(False)

    vocab_table = lookup.index_table_from_file(vocabulary_file='data/vocab.csv', num_oov_buckets=1, default_value=-1)
    text = features[commons.FEATURE_COL]
    words = tf.string_split(text)
    dense_words = tf.sparse_tensor_to_dense(words, default_value=commons.PAD_WORD)
    word_ids = vocab_table.lookup(dense_words)

    padding = tf.constant([[0, 0], [0, commons.MAX_DOCUMENT_LENGTH]])
    # Pad all the word_ids entries to the maximum document length
    word_ids_padded = tf.pad(word_ids, padding)
    word_id_vector = tf.slice(word_ids_padded, [0, 0], [-1, commons.MAX_DOCUMENT_LENGTH])

    word_embeddings = layers.embed_sequence(word_id_vector, vocab_size=params.N_WORDS, embed_dim=50)

    min_vectors = tf.reduce_min(word_embeddings, axis=1)
    max_vectors = tf.reduce_max(word_embeddings, axis=1)

    min_max_vectors = tf.concat([min_vectors, max_vectors], axis=1)

    d1 = tf.keras.layers.Dense(25, activation='relu')(min_max_vectors)
    logits = tf.keras.layers.Dense(commons.TARGET_SIZE)(d1)

    probabilities = tf.nn.softmax(logits)
    predicted_indices = tf.argmax(probabilities, axis=1)

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'class': tf.gather(commons.TARGET_LABELS, predicted_indices),
            'probabilities': probabilities
        }

        exported_outputs = {
            'prediction': tf.estimator.export.PredictOutput(predictions)
        }
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions, export_outputs=exported_outputs)

    weights = features[commons.WEIGHT_COLUNM_NAME]

    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits, weights=weights)
    tf.summary.scalar('loss', loss)

    acc = tf.equal(predicted_indices, labels)
    acc = tf.reduce_mean(tf.cast(acc, tf.float32))

    tf.summary.scalar('acc', acc)

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdamOptimizer()
        train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

    if mode == tf.estimator.ModeKeys.EVAL:
        eval_metrics_ops = {
            'accuracy': tf.metrics.accuracy(labels=labels, predictions=predicted_indices, weights=weights),
            'precision': tf.metrics.precision(labels=labels, predictions=predicted_indices, weights=weights),
            'recall': tf.metrics.recall(labels=labels, predictions=predicted_indices, weights=weights),
            'f1_score': streaming_f1(labels=labels, predictions=predicted_indices)
        }


        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metrics_ops)
Example #28
    def fast_text_model_fn(self, features, labels, mode, params):
        vocab_table = lookup.index_table_from_file(
            vocabulary_file=self.VOCAB_FILE,
            num_oov_buckets=1,
            default_value=-1)
        text = features[self.FEATURE_COL]
        words = tf.string_split(text)
        dense_words = tf.sparse_tensor_to_dense(words,
                                                default_value=self.PAD_WORD)
        word_ids = vocab_table.lookup(dense_words)
        padding = tf.constant([[0, 0], [0, self.MAX_LEN]])
        # Pad all the word_ids entries to the maximum document length
        word_ids_padded = tf.pad(word_ids, padding)
        word_id_vector = tf.slice(word_ids_padded, [0, 0], [-1, self.MAX_LEN])

        if mode == tf.estimator.ModeKeys.TRAIN:
            tf.keras.backend.set_learning_phase(True)
        else:
            tf.keras.backend.set_learning_phase(False)

        with tf.name_scope('embedding'):
            embedding_vectors = layers.embed_sequence(
                word_id_vector,
                vocab_size=self.VOCAB_LEN,
                embed_dim=self.EMBED_DIM,
                initializer=layers.xavier_initializer(seed=42))
            tf.logging.info('Word Vectors = {}'.format(embedding_vectors))

        with tf.name_scope('fast_text'):
            average_vectors = tf.reduce_sum(embedding_vectors, axis=1)
            tf.logging.info(
                'Average Word Vectors = {}'.format(average_vectors))

        with tf.name_scope('hidden_layer'):
            fc1 = tf.keras.layers.Dense(1024,
                                        activation='relu')(average_vectors)
            d1 = tf.keras.layers.Dropout(0.5)(fc1)
            fc2 = tf.keras.layers.Dense(self.EMBED_DIM // 2,
                                        activation='relu')(d1)
            d2 = tf.keras.layers.Dropout(0.5)(fc2)
            tf.logging.info('Hidden Layer = {}'.format(d2))

        with tf.name_scope('output'):
            logits = tf.keras.layers.Dense(self.TARGET_SIZE,
                                           activation=None)(d2)
            tf.logging.info('Logits Layer = {}'.format(logits))

        probabilities = tf.nn.softmax(logits)
        predicted_indices = tf.argmax(probabilities, axis=1)

        tf.summary.histogram('fasttext', average_vectors)
        tf.summary.histogram('softmax', probabilities)

        if mode == tf.estimator.ModeKeys.PREDICT:
            predictions = {
                'class': predicted_indices,
                'probabilities': probabilities
            }

            exported_outputs = {
                'prediction': tf.estimator.export.PredictOutput(predictions)
            }
            return tf.estimator.EstimatorSpec(mode=mode,
                                              predictions=predictions,
                                              export_outputs=exported_outputs)

        loss = tf.losses.sparse_softmax_cross_entropy(labels=labels,
                                                      logits=logits)
        tf.summary.scalar('loss', loss)
        acc = tf.equal(predicted_indices, labels)
        acc = tf.reduce_mean(tf.cast(acc, tf.float32))

        tf.summary.scalar('acc', acc)

        if mode == tf.estimator.ModeKeys.TRAIN:
            optimizer = tf.train.AdamOptimizer()
            train_op = optimizer.minimize(
                loss=loss, global_step=tf.train.get_global_step())
            return tf.estimator.EstimatorSpec(mode=mode,
                                              loss=loss,
                                              train_op=train_op)

        if mode == tf.estimator.ModeKeys.EVAL:
            eval_metrics_ops = {
                'accuracy':
                tf.metrics.accuracy(labels=labels,
                                    predictions=predicted_indices),
                'precision':
                tf.metrics.precision(labels=labels,
                                     predictions=predicted_indices),
                'recall':
                tf.metrics.recall(labels=labels,
                                  predictions=predicted_indices),
                'f1_score':
                self.streaming_f1(labels=labels,
                                  predictions=predicted_indices,
                                  n_classes=self.TARGET_SIZE)
            }
        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          eval_metric_ops=eval_metrics_ops)
Example #29
def main():
    input = [["emersoN", "lAke", "aNd", "palmer"],
             ["i", "haVe", "a", "343yaCht123", "m%an", "2543"]]

    sentences_padded, _ = pad_sequences(input, '')

    sentences = tf.constant(sentences_padded)
    lowercase_sentences = lowercase(sentences)

    table = lookup.index_table_from_tensor(mapping=tf.constant(['']),
                                           default_value=1)

    sequence_lengths = tf.reduce_sum(table.lookup(sentences), 1)

    word_table = lookup.index_table_from_file(vocabulary_file="data/words.txt",
                                              num_oov_buckets=1)

    char_table = lookup.index_table_from_file(vocabulary_file="data/chars.txt",
                                              default_value=-1)

    sentences_shape = tf.shape(sentences, out_type=tf.int64)

    # We need to remove chars not in vocab
    removed_char_sentences = remove_unknown_chars(sentences, char_table)

    split_words = tf.string_split(tf.reshape(removed_char_sentences, [-1]),
                                  delimiter="")
    dense_split_words = tf.sparse_tensor_to_dense(split_words,
                                                  default_value='')

    max_word_len = tf.gather_nd(split_words.dense_shape, [1])
    chars_shape = tf.concat([sentences_shape, [max_word_len]], 0)

    chars = tf.reshape(dense_split_words, chars_shape)

    word_lengths = tf.reduce_sum(table.lookup(chars), 2)

    word_ids = word_table.lookup(sentences)
    char_ids = char_table.lookup(chars)

    word_mask = tf.sequence_mask(sequence_lengths)
    word_ids = tf.where(word_mask, word_ids, tf.zeros_like(word_ids))

    char_mask = tf.sequence_mask(word_lengths)
    char_ids = tf.where(char_mask, char_ids, tf.zeros_like(char_ids))

    config = Config()

    # build model
    model = NERModel(config)
    model.build()
    dev = CoNLLDataset(config.filename_dev, max_iter=config.max_iter)
    train = CoNLLDataset(config.filename_train, max_iter=config.max_iter)

    batch_size = model.config.batch_size

    # iterate over dataset
    for i, (words, labels) in enumerate(minibatches(train, batch_size)):
        print("Start")

        fd, _ = model.get_feed_dict(words, labels, model.config.lr,
                                    model.config.dropout)

        _, train_loss = model.sess.run([model.train_op, model.loss],
                                       feed_dict=fd)

        print("train loss", train_loss)

        metrics = model.run_evaluate(dev)
        msg = " - ".join(
            ["{} {:04.2f}".format(k, v) for k, v in metrics.items()])
        print(msg)
Example #30
    def fast_text_model_fn(self, features, labels, mode, params):
        vocab_table = lookup.index_table_from_file(vocabulary_file=self.VOCAB_FILE, num_oov_buckets=1,
                                                   default_value=-1)
        text = features[self.FEATURE_COL]
        words = tf.string_split(text)
        dense_words = tf.sparse_tensor_to_dense(words, default_value=self.PAD_WORD)
        word_ids = vocab_table.lookup(dense_words)
        padding = tf.constant([[0, 0], [0, self.MAX_LEN]])
        # Pad all the word_ids entries to the maximum document length
        word_ids_padded = tf.pad(word_ids, padding)
        word_id_vector = tf.slice(word_ids_padded, [0, 0], [-1, self.MAX_LEN])

        if mode == tf.estimator.ModeKeys.TRAIN:
            tf.keras.backend.set_learning_phase(True)
        else:
            tf.keras.backend.set_learning_phase(False)

        with tf.name_scope('embedding'):
            embedding_vectors = layers.embed_sequence(word_id_vector, vocab_size=self.VOCAB_LEN,
                                                      embed_dim=self.EMBED_DIM,
                                                      initializer=layers.xavier_initializer(seed=42))
            tf.logging.info('Word Vectors = {}'.format(embedding_vectors))

        with tf.name_scope('fast_text'):
            average_vectors = tf.reduce_sum(embedding_vectors, axis=1)
            tf.logging.info('Average Word Vectors = {}'.format(average_vectors))

        with tf.name_scope('hidden_layer'):
            fc1 = tf.keras.layers.Dense(1024, activation='relu')(average_vectors)
            d1 = tf.keras.layers.Dropout(0.5)(fc1)
            fc2 = tf.keras.layers.Dense(self.EMBED_DIM // 2, activation='relu')(d1)
            d2 = tf.keras.layers.Dropout(0.5)(fc2)
            tf.logging.info('Hidden Layer = {}'.format(d2))

        with tf.name_scope('output'):
            logits = tf.keras.layers.Dense(self.TARGET_SIZE, activation=None)(d2)
            tf.logging.info('Logits Layer = {}'.format(logits))

        probabilities = tf.nn.softmax(logits)
        predicted_indices = tf.argmax(probabilities, axis=1)

        tf.summary.histogram('fasttext', average_vectors)
        tf.summary.histogram('softmax', probabilities)

        if mode == tf.estimator.ModeKeys.PREDICT:
            predictions = {
                'class': predicted_indices,
                'probabilities': probabilities
            }

            exported_outputs = {
                'prediction': tf.estimator.export.PredictOutput(predictions)
            }
            return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions, export_outputs=exported_outputs)

        loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
        tf.summary.scalar('loss', loss)
        acc = tf.equal(predicted_indices, labels)
        acc = tf.reduce_mean(tf.cast(acc, tf.float32))

        tf.summary.scalar('acc', acc)

        if mode == tf.estimator.ModeKeys.TRAIN:
            optimizer = tf.train.AdamOptimizer()
            train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
            return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

        if mode == tf.estimator.ModeKeys.EVAL:
            eval_metrics_ops = {
                'accuracy': tf.metrics.accuracy(labels=labels, predictions=predicted_indices),
                'precision': tf.metrics.precision(labels=labels, predictions=predicted_indices),
                'recall': tf.metrics.recall(labels=labels, predictions=predicted_indices),
                'f1_score': self.streaming_f1(labels=labels, predictions=predicted_indices, n_classes=self.TARGET_SIZE)
            }
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metrics_ops)
Example #31
def model_fn(features, labels, mode, params):
    if mode == tf.estimator.ModeKeys.TRAIN:
        tf.keras.backend.set_learning_phase(True)
    else:
        tf.keras.backend.set_learning_phase(False)

    vocab_table = lookup.index_table_from_file(vocabulary_file='data/vocab.csv', num_oov_buckets=1, default_value=-1)
    text = features[commons.FEATURE_COL]
    words = tf.string_split(text)
    dense_words = tf.sparse_tensor_to_dense(words, default_value=commons.PAD_WORD)
    word_ids = vocab_table.lookup(dense_words)

    padding = tf.constant([[0, 0], [0, commons.MAX_DOCUMENT_LENGTH]])
    # Pad all the word_ids entries to the maximum document length
    word_ids_padded = tf.pad(word_ids, padding)
    word_id_vector = tf.slice(word_ids_padded, [0, 0], [-1, commons.MAX_DOCUMENT_LENGTH])

    f1 = tf.keras.layers.Embedding(params.N_WORDS, 100, input_length=commons.MAX_DOCUMENT_LENGTH)(word_id_vector)
    f2 = tf.keras.layers.Embedding(params.N_WORDS, 200, input_length=commons.MAX_DOCUMENT_LENGTH)(word_id_vector)
    f3 = tf.keras.layers.Embedding(params.N_WORDS, 300, input_length=commons.MAX_DOCUMENT_LENGTH)(word_id_vector)

    filter_sizes = [3, 5]

    conv_pools = []
    for text_embedding in [f1, f2, f3]:
        for filter_size in filter_sizes:
            l_zero = tf.keras.layers.ZeroPadding1D((filter_size - 1, filter_size - 1))(text_embedding)
            l_conv = tf.keras.layers.Conv1D(filters=32, kernel_size=filter_size, padding='same', activation='tanh')(l_zero)
            l_pool = tf.keras.layers.GlobalMaxPool1D()(l_conv)
            conv_pools.append(l_pool)
    merged = tf.keras.layers.Concatenate(axis=1)(conv_pools)
    dense1 = tf.keras.layers.Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(merged)
    dense2 = tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(dense1)

    logits = tf.keras.layers.Dense(2, activation=None)(dense2)

    predictions = tf.nn.softmax(logits)
    prediction_indices = tf.argmax(predictions, axis=1)

    if mode == tf.estimator.ModeKeys.PREDICT:
        prediction_dict = {
            'class': prediction_indices,  # tf.gather(commons.TARGET_LABELS, prediction_indices),
            'class_index': prediction_indices,
            'probabilities': predictions
        }

        export_outputs = {
            'predictions': tf.estimator.export.PredictOutput(prediction_dict)
        }

        return tf.estimator.EstimatorSpec(mode=mode, predictions=prediction_dict, export_outputs=export_outputs)

    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
    tf.summary.scalar('loss', loss)

    acc = tf.equal(tf.cast(prediction_indices, dtype=tf.int64), tf.cast(labels, dtype=tf.int64))
    acc = tf.reduce_mean(tf.cast(acc, tf.float32))

    tf.summary.scalar('acc', acc)

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdamOptimizer()

        train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())

        return tf.estimator.EstimatorSpec(mode=mode, train_op=train_op, loss=loss)

    if mode == tf.estimator.ModeKeys.EVAL:
        eval_metrics_ops = {
            'accuracy': tf.metrics.accuracy(labels=labels, predictions=prediction_indices)
        }
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metrics_ops)
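These Estimator model_fns return export_outputs for serving, which only matters if the model is exported with a matching serving input function. A minimal sketch of one, assuming raw text arrives as a batch of strings under commons.FEATURE_COL (the stand-in feature name and export directory below are illustrative assumptions, not part of the original code):

import tensorflow as tf

FEATURE_COL = 'text'  # stands in for commons.FEATURE_COL used by the model_fns above

def serving_input_receiver_fn():
    # Accept a batch of raw strings; vocab lookup, padding and slicing all
    # happen inside model_fn, so the receiver just forwards the strings.
    text = tf.placeholder(dtype=tf.string, shape=[None], name='text_input')
    return tf.estimator.export.ServingInputReceiver(
        features={FEATURE_COL: text},
        receiver_tensors={'text': text})

# Usage sketch:
# estimator.export_savedmodel('export_dir', serving_input_receiver_fn)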
Exemplo n.º 32
reader = tf.TextLineReader()
_, value = reader.read(filename_queue)
record_defaults = DEFAULTS
col1, col2 = tf.decode_csv(value,
                           record_defaults=record_defaults,
                           field_delim='|')
label = tf.stack([col1])
features = tf.stack([col2])

table = tf.contrib.lookup.index_table_from_tensor(mapping=tf.constant(TARGETS),
                                                  num_oov_buckets=0,
                                                  default_value=-1)
labels = table.lookup(label)

table2 = lookup.index_table_from_file(vocabulary_file='vocab.tsv',
                                      num_oov_buckets=1,
                                      vocab_size=None,
                                      default_value=-1)

# look strings up in the vocabulary
words = tf.string_split(features)
densewords = tf.sparse_tensor_to_dense(words, default_value=PADWORD)
numbers = table2.lookup(densewords)

# pads vectors out to MAX_DOCUMENT_LENGTH
padding = tf.constant([[0, 0], [0, MAX_DOCUMENT_LENGTH]])
padded = tf.pad(numbers, padding)
sliced = tf.slice(padded, [0, 0], [-1, MAX_DOCUMENT_LENGTH])
shaped = tf.reshape(sliced, [MAX_DOCUMENT_LENGTH])
# shaped = tf.expand_dims(shaped, -1)

# batch_size = file_len(filename)
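The pad-then-slice pattern above is what forces every row to exactly MAX_DOCUMENT_LENGTH ids: tf.pad appends MAX_DOCUMENT_LENGTH pad columns on the right, then tf.slice keeps only the first MAX_DOCUMENT_LENGTH columns, so short rows come out padded and long rows come out truncated. A minimal, self-contained sketch of that behavior (the toy in-graph vocabulary and constants here are stand-ins, not the ones used above):

import tensorflow as tf
from tensorflow.contrib import lookup

MAX_DOCUMENT_LENGTH = 5
PADWORD = 'ZYXW'

titles = tf.constant(['a tiny title', 'a much longer title with many many words'])
words = tf.string_split(titles)
densewords = tf.sparse_tensor_to_dense(words, default_value=PADWORD)

# Toy in-graph vocabulary with PADWORD at id 0 (tf.pad fills with zeros);
# the snippet above reads its vocabulary from vocab.tsv instead.
table = lookup.index_table_from_tensor(
    mapping=tf.constant([PADWORD, 'a', 'tiny', 'title', 'much', 'longer',
                         'with', 'many', 'words']),
    num_oov_buckets=1)
numbers = table.lookup(densewords)

# Pad MAX_DOCUMENT_LENGTH extra columns, then keep only the first
# MAX_DOCUMENT_LENGTH columns.
padding = tf.constant([[0, 0], [0, MAX_DOCUMENT_LENGTH]])
sliced = tf.slice(tf.pad(numbers, padding), [0, 0], [-1, MAX_DOCUMENT_LENGTH])

with tf.Session() as sess:
    sess.run(tf.tables_initializer())
    print(sess.run(sliced))  # shape (2, 5): first row padded, second truncated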
Exemplo n.º 33
def model_fn(features, labels, mode, params):
    '''
    CNN model based on Yoon Kim

    https://arxiv.org/pdf/1408.5882.pdf
    :param features:
    :param labels:
    :param mode:
    :param params:
    :return:
    '''
    vocab_table = lookup.index_table_from_file(
        vocabulary_file='dataset/vocab.csv',
        num_oov_buckets=1,
        default_value=-1)
    text = features[commons.FEATURE_COL]
    words = tf.string_split(text)
    dense_words = tf.sparse_tensor_to_dense(words,
                                            default_value=commons.PAD_WORD)
    word_ids = vocab_table.lookup(dense_words)

    padding = tf.constant([[0, 0], [0, commons.MAX_DOCUMENT_LENGTH]])
    # Pad all the word_ids entries to the maximum document length
    word_ids_padded = tf.pad(word_ids, padding)
    word_id_vector = tf.slice(word_ids_padded, [0, 0],
                              [-1, commons.MAX_DOCUMENT_LENGTH])

    if mode == tf.estimator.ModeKeys.TRAIN:
        tf.keras.backend.set_learning_phase(True)
    else:
        tf.keras.backend.set_learning_phase(False)

    embedded_sequences = tf.keras.layers.Embedding(
        params.N_WORDS, 128,
        input_length=commons.MAX_DOCUMENT_LENGTH)(word_id_vector)
    conv_layer = []
    for filter_size in filter_sizes:
        l_conv = tf.keras.layers.Conv1D(filters=128,
                                        kernel_size=filter_size,
                                        activation='relu')(embedded_sequences)
        l_pool = tf.keras.layers.MaxPooling1D(pool_size=3)(l_conv)
        conv_layer.append(l_pool)

    l_merge = tf.keras.layers.concatenate(conv_layer, axis=1)
    conv = tf.keras.layers.Conv1D(filters=128,
                                  kernel_size=3,
                                  activation='relu')(l_merge)
    pool = tf.keras.layers.MaxPooling1D(pool_size=3)(conv)
    f1 = tf.keras.layers.Dropout(0.5)(pool)
    f1 = tf.keras.layers.Flatten()(f1)
    f1 = tf.keras.layers.Dense(128, activation='relu')(f1)
    logits = tf.keras.layers.Dense(commons.TARGET_SIZE, activation=None)(f1)

    predictions = tf.nn.softmax(logits)
    prediction_indices = tf.argmax(predictions, axis=1)

    if mode == tf.estimator.ModeKeys.PREDICT:
        prediction_dict = {
            'class': prediction_indices,
            'probabilities': predictions
        }

        export_outputs = {
            'predictions': tf.estimator.export.PredictOutput(prediction_dict)
        }

        return tf.estimator.EstimatorSpec(mode=mode,
                                          predictions=prediction_dict,
                                          export_outputs=export_outputs)

    weights = features[commons.WEIGHT_COLUNM_NAME]
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels,
                                                  logits=logits,
                                                  weights=weights)

    tf.summary.scalar('loss', loss)

    acc = tf.equal(prediction_indices, labels)
    acc = tf.reduce_mean(tf.cast(acc, tf.float32))

    tf.summary.scalar('acc', acc)

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdamOptimizer()

        train_op = optimizer.minimize(loss=loss,
                                      global_step=tf.train.get_global_step())

        return tf.estimator.EstimatorSpec(mode=mode,
                                          train_op=train_op,
                                          loss=loss)

    if mode == tf.estimator.ModeKeys.EVAL:
        eval_metrics_ops = {
            'accuracy':
            tf.metrics.accuracy(labels=labels,
                                predictions=prediction_indices,
                                weights=weights),
            'precision':
            tf.metrics.precision(labels=labels,
                                 predictions=prediction_indices,
                                 weights=weights),
            'recall':
            tf.metrics.recall(labels=labels,
                              predictions=prediction_indices,
                              weights=weights),
            'f1_score':
            streaming_f1(labels=labels,
                         predictions=prediction_indices,
                         n_classes=commons.TARGET_SIZE,
                         weights=None)
        }
        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          eval_metric_ops=eval_metrics_ops)
Exemplo n.º 34
def model_fn(features, labels, mode, params):
    '''
    CNN model based on Yoon Kim

    https://arxiv.org/pdf/1408.5882.pdf
    :param features:
    :param labels:
    :param mode:
    :param params:
    :return:
    '''
    vocab_table = lookup.index_table_from_file(vocabulary_file='dataset/vocab.csv', num_oov_buckets=1, default_value=-1)
    text = features[commons.FEATURE_COL]
    words = tf.string_split(text)
    dense_words = tf.sparse_tensor_to_dense(words, default_value=commons.PAD_WORD)
    word_ids = vocab_table.lookup(dense_words)

    padding = tf.constant([[0, 0], [0, commons.MAX_DOCUMENT_LENGTH]])
    # Pad all the word_ids entries to the maximum document length
    word_ids_padded = tf.pad(word_ids, padding)
    word_id_vector = tf.slice(word_ids_padded, [0, 0], [-1, commons.MAX_DOCUMENT_LENGTH])

    if mode == tf.estimator.ModeKeys.TRAIN:
        tf.keras.backend.set_learning_phase(True)
    else:
        tf.keras.backend.set_learning_phase(False)

    embedded_sequences = tf.keras.layers.Embedding(params.N_WORDS, 128, input_length=commons.MAX_DOCUMENT_LENGTH)(
        word_id_vector)
    conv_layer = []
    for filter_size in filter_sizes:
        l_conv = tf.keras.layers.Conv1D(filters=128, kernel_size=filter_size, activation='relu')(embedded_sequences)
        l_pool = tf.keras.layers.MaxPooling1D(pool_size=3)(l_conv)
        conv_layer.append(l_pool)

    l_merge = tf.keras.layers.concatenate(conv_layer, axis=1)
    conv = tf.keras.layers.Conv1D(filters=128, kernel_size=3, activation='relu')(l_merge)
    pool = tf.keras.layers.MaxPooling1D(pool_size=3)(conv)
    f1 = tf.keras.layers.Dropout(0.5)(pool)
    f1 = tf.keras.layers.Flatten()(f1)
    f1 = tf.keras.layers.Dense(128, activation='relu')(f1)
    logits = tf.keras.layers.Dense(commons.TARGET_SIZE, activation=None)(f1)

    predictions = tf.nn.softmax(logits)
    prediction_indices = tf.argmax(predictions, axis=1)

    if mode == tf.estimator.ModeKeys.PREDICT:
        prediction_dict = {
            'class': prediction_indices,
            'probabilities': predictions
        }

        export_outputs = {
            'predictions': tf.estimator.export.PredictOutput(prediction_dict)
        }

        return tf.estimator.EstimatorSpec(mode=mode, predictions=prediction_dict, export_outputs=export_outputs)

    weights = features[commons.WEIGHT_COLUNM_NAME]
    tf.logging.info('Logits Layer = {}'.format(logits))
    tf.logging.info('Labels = {}'.format(labels))
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits, weights=weights)

    tf.summary.scalar('loss', loss)

    acc = tf.equal(prediction_indices, labels)
    acc = tf.reduce_mean(tf.cast(acc, tf.float32))

    tf.summary.scalar('acc', acc)

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdamOptimizer()

        train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())

        return tf.estimator.EstimatorSpec(mode=mode, train_op=train_op, loss=loss)

    if mode == tf.estimator.ModeKeys.EVAL:
        eval_metrics_ops = {
            'accuracy': tf.metrics.accuracy(labels=labels, predictions=prediction_indices, weights=weights),
            'precision': tf.metrics.precision(labels=labels, predictions=prediction_indices, weights=weights),
            'recall': tf.metrics.recall(labels=labels, predictions=prediction_indices, weights=weights),
            'f1_score': streaming_f1(labels=labels, predictions=prediction_indices, n_classes=commons.TARGET_SIZE,
                                     weights=None)
        }
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metrics_ops)
Exemplo n.º 35
def model_fn(features, labels, mode, params):
    vocab_table = lookup.index_table_from_file(vocabulary_file='data/vocab.csv', num_oov_buckets=1, default_value=-1)
    text = features[commons.FEATURE_COL]
    words = tf.string_split(text)
    dense_words = tf.sparse_tensor_to_dense(words, default_value=commons.PAD_WORD)
    word_ids = vocab_table.lookup(dense_words)

    padding = tf.constant([[0, 0], [0, commons.MAX_DOCUMENT_LENGTH]])
    # Pad all the word_ids entries to the maximum document length
    word_ids_padded = tf.pad(word_ids, padding)
    word_id_vector = tf.slice(word_ids_padded, [0, 0], [-1, commons.MAX_DOCUMENT_LENGTH])

    word_id_vector = {commons.FEATURE_COL: word_id_vector}

    bow_column = tf.feature_column.categorical_column_with_identity(commons.FEATURE_COL, num_buckets=params.N_WORDS)
    bow_embedding_column = tf.feature_column.embedding_column(bow_column, dimension=50, combiner='sqrtn')
    bow = tf.feature_column.input_layer(word_id_vector, feature_columns=[bow_embedding_column])
    logits = tf.layers.dense(bow, 2, activation=None)

    predictions = tf.nn.softmax(logits)
    prediction_indices = tf.argmax(predictions, axis=1)

    if mode == tf.estimator.ModeKeys.PREDICT:
        prediction_dict = {
            'class': tf.gather(commons.TARGET_LABELS, prediction_indices),
            'probabilities': predictions
        }

        export_outputs = {
            'predictions': tf.estimator.export.PredictOutput(prediction_dict)
        }

        return tf.estimator.EstimatorSpec(mode=mode, predictions=prediction_dict, export_outputs=export_outputs)

    weights = features[commons.WEIGHT_COLUNM_NAME]
    print(tf.shape(labels))
    print(tf.shape(logits))
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits, weights=weights)

    tf.summary.scalar('loss', loss)

    acc = tf.equal(prediction_indices, labels)
    acc = tf.reduce_mean(tf.cast(acc, tf.float32))

    tf.summary.scalar('acc', acc)

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdamOptimizer()

        train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())

        return tf.estimator.EstimatorSpec(mode=mode, train_op=train_op, loss=loss)

    if mode == tf.estimator.ModeKeys.EVAL:
        eval_metrics_ops = {
            'accuracy': tf.metrics.accuracy(labels=labels, predictions=prediction_indices, weights=weights),
            'precision': tf.metrics.precision(labels=labels, predictions=prediction_indices, weights=weights),
            'recall': tf.metrics.recall(labels=labels, predictions=prediction_indices, weights=weights),
            'f1_score': streaming_f1(labels=labels, predictions=prediction_indices, n_classes=commons.TARGET_SIZE,
                                     weights=None)
        }
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metrics_ops)
Exemplo n.º 36
    def _model_fn(self, features, labels, mode, params):
        '''

        :param features: TF Placeholder of type String of shape [BATCH_SIZE, 1]
        :param labels: TF Placeholder of type String of shape [BATCH_SIZE, 1]
        :param mode: ModeKeys
        :param params:
        :return:
        '''

        is_training = mode == ModeKeys.TRAIN

        # [BATCH_SIZE, 1]
        text_features = features[self.FEATURE_1_NAME]
        positional_features = features[self.FEATURE_3_NAME]

        if self.ner_config.USE_CHAR_EMBEDDING:
            # [BATCH_SIZE, MAX_SEQ_LENGTH, MAX_WORD_LENGTH]
            char_ids = features[self.FEATURE_2_NAME]

            tf.logging.info('char_ids: =======> {}'.format(char_ids))

            s = tf.shape(char_ids)

            # remove pad words
            char_ids_reshaped = tf.reshape(char_ids,
                                           shape=(s[0] * s[1],
                                                  s[2]))  # 20 -> char dim

        with tf.variable_scope("sentence-words-2-ids"):
            word_table = lookup.index_table_from_file(
                vocabulary_file=self.ner_config.WORDS_VOCAB_FILE,
                num_oov_buckets=0,  # TODO use this for Out of Vocab
                default_value=1,  # id of <UNK>  w.r.t WORD VOCAB
                name="table")
            tf.logging.info('table info: {}'.format(word_table))

            # [BATCH_SIZE, 1]
            words = tf.string_split(text_features, delimiter=SEPERATOR)

            # [BATCH_SIZE, ?] i.e [BATCH_SIZE, VARIABLE_SEQ_LENGTH]
            densewords = tf.sparse_tensor_to_dense(
                words,
                default_value=self.ner_config.PAD_WORD)  # TODO add test case

            # [BATCH_SIZE, ?] i.e [BATCH_SIZE, MAX_SEQ_LENGTH]
            token_ids = word_table.lookup(
                densewords)  # TODO check is it variable length or not?

            tf.logging.info('token_ids_shape: ------> {}'.format(
                token_ids.shape[1]))
            tf.logging.info('densewords_shape: ------> {}'.format(
                densewords.shape))
            tf.logging.info(
                "positional_shape: ---->{}".format(positional_features))

        with tf.variable_scope("ner-tags-2-ids"):
            if mode != ModeKeys.INFER:
                ner_table = lookup.index_table_from_file(
                    vocabulary_file=self.ner_config.TAGS_VOCAB_FILE,
                    num_oov_buckets=0,
                    default_value=0,  # id of <UNK> w.r.t ENTITY VOCAB
                    name="table")

                tf.logging.info('ner_table info: {}'.format(ner_table))

                # [BATCH_SIZE, 1]
                labels_splitted = tf.string_split(labels, delimiter=SEPERATOR)
                # [BATCH_SIZE, ?] i.e [BATCH_SIZE, VARIABLE_SEQ_LENGTH]
                labels_splitted_dense = tf.sparse_tensor_to_dense(
                    labels_splitted, default_value="O")
                # [BATCH_SIZE, ?] i.e [BATCH_SIZE, MAX_SEQ_LENGTH]
                ner_ids = ner_table.lookup(labels_splitted_dense)
                ner_ids = tf.cast(ner_ids, tf.int32)

                tf.logging.info("ner_ids: {}".format(ner_ids))

        with tf.variable_scope("word-embed-layer"):
            # layer to take the words and convert them into vectors (embeddings)
            # This creates embeddings matrix of [VOCAB_SIZE, EMBEDDING_SIZE] and then
            # maps word indexes of the sequence into
            # [BATCH_SIZE, MAX_SEQ_LENGTH] --->  [BATCH_SIZE, MAX_SEQ_LENGTH, WORD_EMBEDDING_SIZE].
            word_embeddings = tf.contrib.layers.embed_sequence(
                token_ids,
                vocab_size=self.ner_config.VOCAB_SIZE,
                embed_dim=self.ner_config.WORD_EMBEDDING_SIZE,
                initializer=tf.contrib.layers.xavier_initializer(seed=42))
            tf.logging.info('positional_features_length =====> {}'.format(
                positional_features.shape))

            tf.logging.info('word_embeddings_shape: ------> {}'.format(
                word_embeddings.shape))

            # word_embeddings = tf.concat([ word_embeddings, positional_features], axis=-1)

            tf.logging.info(
                'word_embeddings: ------> {}'.format(word_embeddings))

            word_embeddings = tf.layers.dropout(
                word_embeddings,
                rate=self.ner_config.KEEP_PROP,
                seed=42,
                training=mode == tf.estimator.ModeKeys.TRAIN)

            # [BATCH_SIZE, MAX_SEQ_LENGTH, WORD_EMBEDDING_SIZE]
            tf.logging.info(
                'word_embeddings =====> {}'.format(word_embeddings))

            # seq_length = get_sequence_length_old(word_embeddings) TODO working
            # [BATCH_SIZE, ]
            seq_length = get_sequence_length(token_ids)

            tf.logging.info('seq_length =====> {}'.format(seq_length))

        with tf.variable_scope("char_embed_layer"):
            if self.ner_config.USE_CHAR_EMBEDDING:
                print_error((self.ner_config.CHAR_VOCAB_SIZE,
                             self.ner_config.CHAR_EMBEDDING_SIZE))
                char_embeddings = tf.contrib.layers.embed_sequence(
                    char_ids,
                    vocab_size=self.ner_config.CHAR_VOCAB_SIZE,
                    embed_dim=self.ner_config.CHAR_EMBEDDING_SIZE,
                    initializer=tf.contrib.layers.xavier_initializer(seed=42))

                # [BATCH_SIZE, MAX_SEQ_LENGTH, MAX_WORD_LENGTH, CHAR_EMBEDDING_SIZE]
                char_embeddings = tf.layers.dropout(
                    char_embeddings,
                    rate=self.ner_config.KEEP_PROP,
                    seed=42,
                    training=mode ==
                    tf.estimator.ModeKeys.TRAIN)  # TODO add test case

                tf.logging.info(
                    'char_embeddings =====> {}'.format(char_embeddings))

        with tf.variable_scope("chars_level_bilstm_layer"):
            if self.ner_config.USE_CHAR_EMBEDDING:
                # put the time dimension on axis=1
                shape = tf.shape(char_embeddings)

                BATCH_SIZE = shape[0]
                MAX_DOC_LENGTH = shape[1]
                CHAR_MAX_LENGTH = shape[2]

                TOTAL_DOCS_LENGTH = tf.reduce_sum(seq_length)

                # [BATCH_SIZE, MAX_SEQ_LENGTH, MAX_WORD_LENGTH, CHAR_EMBEDDING_SIZE]  ===>
                #      [BATCH_SIZE * MAX_SEQ_LENGTH, MAX_WORD_LENGTH, CHAR_EMBEDDING_SIZE]
                char_embeddings = tf.reshape(
                    char_embeddings,
                    shape=[
                        BATCH_SIZE * MAX_DOC_LENGTH, CHAR_MAX_LENGTH,
                        self.ner_config.CHAR_EMBEDDING_SIZE
                    ],
                    name="reduce_dimension_1")

                tf.logging.info('reshaped char_embeddings =====> {}'.format(
                    char_embeddings))

                # word_lengths = get_sequence_length_old(char_embeddings) TODO working
                word_lengths = get_sequence_length(char_ids_reshaped)

                tf.logging.info('word_lengths =====> {}'.format(word_lengths))

                # bi lstm on chars
                cell_fw = tf.contrib.rnn.LSTMCell(
                    self.ner_config.CHAR_LEVEL_LSTM_HIDDEN_SIZE,
                    state_is_tuple=True)
                cell_bw = tf.contrib.rnn.LSTMCell(
                    self.ner_config.CHAR_LEVEL_LSTM_HIDDEN_SIZE,
                    state_is_tuple=True)

                _output = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw=cell_fw,
                    cell_bw=cell_bw,
                    dtype=tf.float32,
                    sequence_length=word_lengths,
                    inputs=char_embeddings,
                    scope="encode_words")

                # read and concat output
                _, ((_, output_fw), (_, output_bw)) = _output
                encoded_words = tf.concat([output_fw, output_bw], axis=-1)

                # [BATCH_SIZE, MAX_SEQ_LENGTH, WORD_EMBEDDING_SIZE]
                encoded_words = tf.reshape(
                    encoded_words,
                    shape=[
                        BATCH_SIZE, MAX_DOC_LENGTH,
                        2 * self.ner_config.CHAR_LEVEL_LSTM_HIDDEN_SIZE
                    ])

                tf.logging.info(
                    'encoded_words =====> {}'.format(encoded_words))

        with tf.variable_scope("word_level_lstm_layer"):
            # Create a LSTM Unit cell with hidden size of EMBEDDING_SIZE.
            d_rnn_cell_fw_one = tf.nn.rnn_cell.LSTMCell(
                self.ner_config.WORD_LEVEL_LSTM_HIDDEN_SIZE,
                state_is_tuple=True)
            d_rnn_cell_bw_one = tf.nn.rnn_cell.LSTMCell(
                self.ner_config.WORD_LEVEL_LSTM_HIDDEN_SIZE,
                state_is_tuple=True)

            if is_training:
                d_rnn_cell_fw_one = tf.contrib.rnn.DropoutWrapper(
                    d_rnn_cell_fw_one,
                    output_keep_prob=self.ner_config.KEEP_PROP)
                d_rnn_cell_bw_one = tf.contrib.rnn.DropoutWrapper(
                    d_rnn_cell_bw_one,
                    output_keep_prob=self.ner_config.KEEP_PROP)
            else:
                d_rnn_cell_fw_one = tf.contrib.rnn.DropoutWrapper(
                    d_rnn_cell_fw_one, output_keep_prob=1.0)
                d_rnn_cell_bw_one = tf.contrib.rnn.DropoutWrapper(
                    d_rnn_cell_bw_one, output_keep_prob=1.0)

            d_rnn_cell_fw_one = tf.nn.rnn_cell.MultiRNNCell(
                cells=[d_rnn_cell_fw_one] * self.ner_config.NUM_LSTM_LAYERS,
                state_is_tuple=True)
            d_rnn_cell_bw_one = tf.nn.rnn_cell.MultiRNNCell(
                cells=[d_rnn_cell_bw_one] * self.ner_config.NUM_LSTM_LAYERS,
                state_is_tuple=True)

            (fw_output_one,
             bw_output_one), _ = tf.nn.bidirectional_dynamic_rnn(
                 cell_fw=d_rnn_cell_fw_one,
                 cell_bw=d_rnn_cell_bw_one,
                 dtype=tf.float32,
                 sequence_length=seq_length,
                 inputs=word_embeddings,
                 scope="encod_sentence")

            # [BATCH_SIZE, MAX_SEQ_LENGTH, 2*WORD_LEVEL_LSTM_HIDDEN_SIZE) TODO check MAX_SEQ_LENGTH?
            encoded_sentence = tf.concat([fw_output_one, bw_output_one],
                                         axis=-1)

            tf.logging.info(
                'encoded_sentence =====> {}'.format(encoded_sentence))

        #================================================================================================
        with tf.variable_scope("positional_lstm_layer"):

            positional_features = tf.layers.batch_normalization(
                positional_features)

            tf.logging.info(
                'positional_features =====> {}'.format(positional_features))
            tf.logging.info('encoded_words =====> {}'.format(encoded_words))
            tf.logging.info(
                'encoded_sentence =====> {}'.format(encoded_sentence))

        with tf.variable_scope("char_word_embeddings-mergeing_layer"):
            if self.ner_config.USE_CHAR_EMBEDDING:
                encoded_doc = tf.concat(
                    [encoded_words, encoded_sentence, positional_features],
                    axis=-1)
            else:
                encoded_doc = tf.concat(
                    [encoded_sentence, positional_features], axis=-1)

            # [BATCH_SIZE, MAX_SEQ_LENGTH, 2*WORD_LEVEL_LSTM_HIDDEN_SIZE + 2*CHAR_LEVEL_LSTM_HIDDEN_SIZE]
            encoded_doc = tf.layers.dropout(
                encoded_doc,
                rate=self.ner_config.KEEP_PROP,
                seed=42,
                training=mode == tf.estimator.ModeKeys.TRAIN)

            tf.logging.info('encoded_doc: =====> {}'.format(encoded_doc))

        with tf.variable_scope("projection"):

            logits = tf.layers.dense(encoded_doc,
                                     self.ner_config.NUM_TAGS,
                                     name="logit_predictions")

            tf.logging.info("logits: {}".format(logits))

        with tf.variable_scope("loss-layer"):
            """Defines the loss"""

            if mode == ModeKeys.INFER:
                ner_ids = tf.placeholder(
                    tf.int32, shape=[None, None],
                    name="labels")  # no labels during prediction
            else:
                ner_ids = ner_ids

            if True:  # self.config.use_crf:
                log_likelihood, trans_params = tf.contrib.crf.crf_log_likelihood(
                    logits, ner_ids, seq_length)

                tf.logging.info(
                    "log_likelihood:  =====> {}".format(log_likelihood))

                # [NUM_TAGS, NUM_TAGS]
                trans_params = trans_params  # need to evaluate it for decoding
                tf.logging.info("trans_params: =====> {}".format(trans_params))
                ner_crf_loss = tf.reduce_mean(-log_likelihood)

                tf.summary.scalar("loss", ner_crf_loss)
            else:
                losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=logits, labels=ner_ids)
                mask = tf.sequence_mask(seq_length)
                losses = tf.boolean_mask(losses, mask)
                ner_crf_loss = tf.reduce_mean(losses)
                tf.summary.scalar("loss", ner_crf_loss)

            viterbi_seq, best_score = tf.contrib.crf.crf_decode(
                logits, trans_params, seq_length)

            tf.logging.info("viterbi_seq: {}".format(viterbi_seq))

            predictions = {
                "classes":
                tf.cast(tf.argmax(logits, axis=-1), tf.int32),
                # [BATCH_SIZE, SEQ_LEN]
                "viterbi_seq":
                viterbi_seq,
                # [BATCH_SIZE]
                "confidence":
                tf.reduce_max(tf.nn.softmax(logits, dim=-1), axis=-1),
                "top_3_indices":
                tf.nn.top_k(tf.nn.softmax(logits, dim=-1), k=3).indices,
                "top_3_confidence":
                tf.nn.top_k(tf.nn.softmax(logits, dim=-1), k=3).values
            }

        # Loss, training and eval operations are not needed during inference.
        loss = None
        train_op = None
        eval_metric_ops = {}

        if mode != ModeKeys.INFER:
            train_op = tf.contrib.layers.optimize_loss(
                loss=ner_crf_loss,
                global_step=tf.train.get_global_step(),
                optimizer=tf.train.AdamOptimizer,
                learning_rate=self.ner_config.LEARNING_RATE)

            loss = ner_crf_loss

            eval_metric_ops = {
                'Accuracy':
                tf.metrics.accuracy(labels=ner_ids,
                                    predictions=predictions["viterbi_seq"],
                                    name='accuracy'),
                'Precision':
                tf.metrics.precision(labels=ner_ids,
                                     predictions=predictions["viterbi_seq"],
                                     name='Precision'),
                'Recall':
                tf.metrics.recall(labels=ner_ids,
                                  predictions=predictions["viterbi_seq"],
                                  name='Recall')
            }

        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions=predictions,
            loss=loss,
            train_op=train_op,
            eval_metric_ops=eval_metric_ops,
            # training_hooks=self.hooks
        )
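The loss layer above pairs tf.contrib.crf.crf_log_likelihood (the training objective) with tf.contrib.crf.crf_decode (Viterbi decoding), sharing the learned transition matrix between the two. A minimal, self-contained sketch of that pairing, with random unary scores standing in for the BiLSTM projection above:

import tensorflow as tf

BATCH_SIZE, MAX_SEQ_LEN, NUM_TAGS = 2, 7, 5

# Random unary scores and tags, for illustration only.
logits = tf.random_normal([BATCH_SIZE, MAX_SEQ_LEN, NUM_TAGS])
tag_ids = tf.random_uniform([BATCH_SIZE, MAX_SEQ_LEN], maxval=NUM_TAGS, dtype=tf.int32)
seq_length = tf.constant([7, 4], dtype=tf.int32)

# Training: negative mean log-likelihood under a linear-chain CRF.
# crf_log_likelihood creates the transition-parameter variable internally.
log_likelihood, trans_params = tf.contrib.crf.crf_log_likelihood(
    logits, tag_ids, seq_length)
crf_loss = tf.reduce_mean(-log_likelihood)

# Prediction: Viterbi decoding with the same transition parameters.
viterbi_seq, best_score = tf.contrib.crf.crf_decode(logits, trans_params, seq_length)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    loss_val, tags_val = sess.run([crf_loss, viterbi_seq])
    print(loss_val, tags_val.shape)  # scalar loss, (2, 7) decoded tag ids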
Exemplo n.º 37
def export():
    checkpoint_path = FLAGS.checkpoint_path
    export_path = FLAGS.export_path
    vocab_path = FLAGS.vocab_path

    num_steps = FLAGS.num_steps
    vocab_size = FLAGS.vocab_size
    embedding_size = FLAGS.embedding_size
    hidden_size = FLAGS.hidden_size
    keep_prob = FLAGS.keep_prob
    num_layers = FLAGS.num_layers
    num_classes = FLAGS.num_classes
    prop_limit = FLAGS.prop_limit

    # split 1-D String dense Tensor to words SparseTensor
    sentences = tf.placeholder(dtype=tf.string,
                               shape=[None],
                               name='input_sentences')
    sparse_words = tf.string_split(sentences, delimiter=' ')

    # slice SparseTensor
    valid_indices = tf.less(sparse_words.indices,
                            tf.constant([num_steps], dtype=tf.int64))
    valid_indices = tf.reshape(
        tf.split(valid_indices, [1, 1], axis=1)[1], [-1])
    valid_sparse_words = tf.sparse_retain(sparse_words, valid_indices)

    excess_indices = tf.greater_equal(sparse_words.indices,
                                      tf.constant([num_steps], dtype=tf.int64))
    excess_indices = tf.reshape(
        tf.split(excess_indices, [1, 1], axis=1)[1], [-1])
    excess_sparse_words = tf.sparse_retain(sparse_words, excess_indices)

    # sparse to dense
    words = tf.sparse_to_dense(
        sparse_indices=valid_sparse_words.indices,
        output_shape=[valid_sparse_words.dense_shape[0], num_steps],
        sparse_values=valid_sparse_words.values,
        default_value='_PAD')

    # dict words to token ids
    words_table = lookup.index_table_from_file(os.path.join(
        vocab_path, 'words_vocab.txt'),
                                               default_value=3)
    words_ids = words_table.lookup(words)

    # blstm model predict
    with tf.variable_scope('model', reuse=None):
        logits, _ = ner_model.inference(words_ids,
                                        valid_sparse_words.dense_shape[0],
                                        num_steps,
                                        vocab_size,
                                        embedding_size,
                                        hidden_size,
                                        keep_prob,
                                        num_layers,
                                        num_classes,
                                        is_training=False)
    props = tf.nn.softmax(logits)
    max_prop_values, max_prop_indices = tf.nn.top_k(props, k=1)

    predict_scores = tf.reshape(max_prop_values, shape=[-1, num_steps])
    predict_labels_ids = tf.reshape(max_prop_indices, shape=[-1, num_steps])
    predict_labels_ids = tf.to_int64(predict_labels_ids)

    # replace untrusted prop that less than prop_limit
    trusted_prop_flag = tf.greater_equal(
        predict_scores, tf.constant(prop_limit, dtype=tf.float32))
    replace_prop_labels_ids = tf.to_int64(
        tf.fill(tf.shape(predict_labels_ids), 4))
    predict_labels_ids = tf.where(trusted_prop_flag, predict_labels_ids,
                                  replace_prop_labels_ids)

    # dict token ids to labels
    labels_table = lookup.index_to_string_table_from_file(os.path.join(
        vocab_path, 'labels_vocab.txt'),
                                                          default_value='o')
    predict_labels = labels_table.lookup(predict_labels_ids)

    # extract real blstm predict label in dense and save to sparse
    valid_sparse_predict_labels = tf.SparseTensor(
        indices=valid_sparse_words.indices,
        values=tf.gather_nd(predict_labels, valid_sparse_words.indices),
        dense_shape=valid_sparse_words.dense_shape)

    # create excess label SparseTensor with 'O'
    excess_sparse_predict_labels = tf.SparseTensor(
        indices=excess_sparse_words.indices,
        values=tf.fill(tf.shape(excess_sparse_words.values), 'O'),
        dense_shape=excess_sparse_words.dense_shape)

    # concat SparseTensor
    sparse_predict_labels = tf.SparseTensor(
        indices=tf.concat(axis=0,
                          values=[
                              valid_sparse_predict_labels.indices,
                              excess_sparse_predict_labels.indices
                          ]),
        values=tf.concat(axis=0,
                         values=[
                             valid_sparse_predict_labels.values,
                             excess_sparse_predict_labels.values
                         ]),
        dense_shape=excess_sparse_predict_labels.dense_shape)
    sparse_predict_labels = tf.sparse_reorder(sparse_predict_labels)

    # join SparseTensor to 1-D String dense Tensor
    # remaining issue: num_split should equal the real batch size, but here it is limited to 1
    join_labels_list = []
    slice_labels_list = tf.sparse_split(sp_input=sparse_predict_labels,
                                        num_split=1,
                                        axis=0)
    for slice_labels in slice_labels_list:
        slice_labels = slice_labels.values
        join_labels = tf.reduce_join(slice_labels,
                                     reduction_indices=0,
                                     separator=' ')
        join_labels_list.append(join_labels)
    format_predict_labels = tf.stack(join_labels_list)

    saver = tf.train.Saver()

    with tf.Session() as sess:
        ckpt = tf.train.get_checkpoint_state(checkpoint_path)
        if ckpt and ckpt.model_checkpoint_path:
            print('read model from {}'.format(ckpt.model_checkpoint_path))
            saver.restore(sess, ckpt.model_checkpoint_path)
            global_step = int(ckpt.model_checkpoint_path.split('-')[-1])
        else:
            print('No checkpoint file found at %s' % FLAGS.checkpoint_path)
            return

        # Export inference model.
        output_path = os.path.join(export_path, str(global_step))
        print('Exporting trained model to', output_path)
        builder = tf.saved_model.builder.SavedModelBuilder(output_path)

        # Build the signature_def_map.
        predict_inputs_tensor_info = tf.saved_model.utils.build_tensor_info(
            sentences)
        predict_output_tensor_info = tf.saved_model.utils.build_tensor_info(
            format_predict_labels)
        prediction_signature = tf.saved_model.signature_def_utils.build_signature_def(
            inputs={
                'input_sentences': predict_inputs_tensor_info,
            },
            outputs={'classes': predict_output_tensor_info},
            method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME)

        legacy_init_op = tf.group(tf.tables_initializer(),
                                  name='legacy_init_op')
        builder.add_meta_graph_and_variables(
            sess, [tf.saved_model.tag_constants.SERVING],
            signature_def_map={'predict_ner': prediction_signature},
            legacy_init_op=legacy_init_op)

        builder.save()
        print('Successfully exported model to %s' % export_path)
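A model exported this way can be loaded back through the same signature. A sketch of a client, assuming the 'predict_ner' signature defined above and an export directory of the form export_path/<global_step> (the concrete path below is illustrative):

import tensorflow as tf

export_dir = 'export/1234567'  # hypothetical export_path/<global_step>

with tf.Session(graph=tf.Graph()) as sess:
    # Loading also runs legacy_init_op, so the lookup tables get initialized.
    meta_graph_def = tf.saved_model.loader.load(
        sess, [tf.saved_model.tag_constants.SERVING], export_dir)
    signature = meta_graph_def.signature_def['predict_ner']

    input_name = signature.inputs['input_sentences'].name
    output_name = signature.outputs['classes'].name

    labels = sess.run(output_name,
                      feed_dict={input_name: ['John lives in New York']})
    print(labels)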
Exemplo n.º 38
def apply_vocabulary(x,
                     deferred_vocab_filename_tensor,
                     default_value=-1,
                     num_oov_buckets=0,
                     lookup_fn=None,
                     name=None):
    r"""Maps `x` to a vocabulary specified by the deferred tensor.

  This function also writes domain statistics about the vocabulary min and max
  values. Note that the min and max are inclusive, and depend on the vocab size,
  num_oov_buckets and default_value.

  In case one of the tokens contains the '\n' or '\r' characters or is empty it
  will be discarded since we are currently writing the vocabularies as text
  files. This behavior will likely be fixed/improved in the future.

  Args:
    x: A `Tensor` or `SparseTensor` of type tf.string to which the vocabulary
      transformation should be applied.
      The column names are those intended for the transformed tensors.
    deferred_vocab_filename_tensor: The deferred vocab filename tensor as
      returned by `tft.vocabulary`.
    default_value: The value to use for out-of-vocabulary values, unless
      'num_oov_buckets' is greater than zero.
    num_oov_buckets:  Any lookup of an out-of-vocabulary token will return a
      bucket ID based on its hash if `num_oov_buckets` is greater than zero.
      Otherwise it is assigned the `default_value`.
    lookup_fn: Optional lookup function. If specified, it should take a
      tensor and a deferred vocab filename as inputs and return a lookup `op`
      along with the table size; by default `apply_vocabulary` performs a
      `lookup.index_table_from_file` for the table lookup.
    name: (Optional) A name for this operation.

  Returns:
    A `Tensor` or `SparseTensor` where each string value is mapped to an
    integer. Each unique string value that appears in the vocabulary
    is mapped to a different integer; integers are consecutive
    starting from zero, and string values not in the vocabulary are
    assigned `default_value`.
  """
    with tf.name_scope(name, 'apply_vocab'):
        if lookup_fn:
            result, table_size = lookup_fn(x, deferred_vocab_filename_tensor)
        else:
            table = lookup.index_table_from_file(
                deferred_vocab_filename_tensor,
                num_oov_buckets=num_oov_buckets,
                default_value=default_value)
            table_size = table.size()
            result = table.lookup(x)

        # Specify schema overrides which will override the values in the schema
        # with the min and max values, which are deferred as they are only known
        # once the analyzer has run.
        #
        # `table_size` includes the num oov buckets.  The default value is only used
        # if num_oov_buckets <= 0.
        min_value = tf.constant(0, tf.int64)
        max_value = table_size - 1
        if num_oov_buckets <= 0:
            min_value = tf.minimum(min_value, default_value)
            max_value = tf.maximum(max_value, default_value)
        schema_inference.set_tensor_schema_override(
            result.values if isinstance(result, tf.SparseTensor) else result,
            min_value, max_value)

        return result
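A usage sketch inside a tensorflow_transform preprocessing_fn, assuming tft.vocabulary returns the deferred vocab filename tensor this function expects and that the input column is named 'text' (the column and vocab filename are illustrative):

import tensorflow as tf
import tensorflow_transform as tft

def preprocessing_fn(inputs):
    # Tokenize the raw text column into a SparseTensor of words.
    tokens = tf.string_split(inputs['text'])

    # Analyze phase: write the vocabulary file and get its deferred filename tensor.
    deferred_vocab = tft.vocabulary(tokens, vocab_filename='text_vocab')

    # Transform phase: map each token to its vocabulary index, sending unseen
    # tokens to a single OOV bucket instead of default_value.
    token_ids = tft.apply_vocabulary(tokens, deferred_vocab, num_oov_buckets=1)

    return {'token_ids': token_ids}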