def __init__(self, user_interactive_items, user_list_file, item_list_file, sess):
    self.user_items = user_interactive_items
    self.item_table = index_table_from_file(item_list_file)
    self.user_table = index_table_from_file(user_list_file)
    sess.run(tf.tables_initializer())
    self.generate_sparse_tensor_table(sess)
def create_vocab_tables(src_vocab_file, tgt_vocab_file):
    src_vocab_table = index_table_from_file(src_vocab_file, default_value=UNK_ID)
    tgt_vocab_table = index_table_from_file(tgt_vocab_file, default_value=UNK_ID)
    return src_vocab_table, tgt_vocab_table
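# Minimal usage sketch (not from the snippets above): how a table returned by
# index_table_from_file is initialized and queried in TF 1.x. The vocabulary file
# name and the tokens are illustrative assumptions.
import tensorflow as tf
from tensorflow.contrib import lookup

# 'vocab.txt' is a hypothetical file with one token per line, e.g. "the", "cat", "sat".
table = lookup.index_table_from_file(vocabulary_file='vocab.txt',
                                     num_oov_buckets=1, default_value=-1)
ids = table.lookup(tf.constant(['the', 'sat', 'dinosaur']))

with tf.Session() as sess:
    sess.run(tf.tables_initializer())  # lookup tables must be initialized before use
    print(sess.run(ids))  # e.g. [0 2 3]; the OOV word hashes into the extra bucket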
def __init__(self, tensor_key, tags_id_lookup_file, num_classes, dtype=tf.float32,
             shape_keys=None, shape=None, default_value='',
             tags_key_column_index=None, tags_value_column_index=None,
             delimiter=None):
    """Initializes the OneHotLabelTensor handler, which decodes label text into one-hot encodings."""
    lookup_kwargs = {}
    if tags_key_column_index is not None:
        lookup_kwargs['key_column_index'] = tags_key_column_index
    if tags_value_column_index is not None:
        lookup_kwargs['value_column_index'] = tags_value_column_index
    table = lookup.index_table_from_file(tags_id_lookup_file, **lookup_kwargs)
    self._table = table
    self._delimiter = delimiter or TAG_TEXT_DELIMITER
    self._num_classes = num_classes
    self._dtype = dtype
    super(OneHotLabelTensor, self).__init__(tensor_key, shape_keys, shape, default_value)
def _apply_vocab(y, deferred_vocab_filename_tensor):
    table = lookup.index_table_from_file(
        deferred_vocab_filename_tensor,
        num_oov_buckets=num_oov_buckets,
        default_value=default_value)
    table_size = table.size()
    return table.lookup(y), table_size
def __init__(self, config):
    self.config = config
    self.vocabulary = lookup.index_table_from_file(
        self.config.vocabulary_file, num_oov_buckets=0, default_value=0)
    self.pad_id = self.vocabulary.lookup(tf.constant(self.config.pad_sign))
    self.sentence_data = self.load_dataset_from_text(self.config.sentence_data_file_path)
    self.label_data = self.load_dataset_from_text(self.config.label_data_file_path)
    self.input_fn(self.sentence_data, self.label_data)
def linear_model(features, target, mode):
    # make input features numeric
    from tensorflow.contrib import lookup
    table = lookup.index_table_from_file(
        vocabulary_file=WORD_VOCAB_FILE, num_oov_buckets=1, vocab_size=N_WORDS,
        default_value=-1, name="word_to_index")

    titles = tf.squeeze(features['title'], [1])
    words = tf.string_split(titles)
    words = tf.sparse_tensor_to_dense(words, default_value='ZYXW')
    words = table.lookup(words)
    print('lookup_words={}'.format(words))

    # each row has a variable number of words;
    # take the first MAX_DOCUMENT_LENGTH words (pad shorter titles to this)
    padding = tf.stack([tf.zeros_like(titles, dtype=tf.int64),
                        tf.ones_like(titles, dtype=tf.int64) * MAX_DOCUMENT_LENGTH])
    words = tf.pad(words, padding)
    words = tf.slice(words, [0, 0], [-1, MAX_DOCUMENT_LENGTH])
    print('words_sliced={}'.format(words))  # (?, 20)

    # embed the words in a common way
    words = tf.contrib.layers.embed_sequence(
        words, vocab_size=(N_WORDS + 1), embed_dim=EMBEDDING_SIZE, scope='words')
    print('words_embed={}'.format(words))  # (?, 20, 10)

    # now do convolution
    conv = tf.contrib.layers.convolution2d(words, 5, [3, EMBEDDING_SIZE], padding='VALID')
    conv = tf.nn.relu(conv)
    words = tf.nn.max_pool(conv,
                           ksize=[1, POOLING_WINDOW, 1, 1],
                           strides=[1, POOLING_STRIDE, 1, 1],
                           padding='SAME')
    print('words_conv={}'.format(words))

    n_classes = len(TARGETS)
    logits = tf.contrib.layers.fully_connected(words, n_classes, activation_fn=None)
    print('logits={}'.format(logits))
    logits = tf.squeeze(logits, squeeze_dims=[1])  # from (?, 1, 3) to (?, 3)

    predictions_dict = {
        'source': tf.gather(TARGETS, tf.argmax(logits, 1)),
        'class': tf.argmax(logits, 1),
        'prob': tf.nn.softmax(logits)
    }

    if mode == tf.contrib.learn.ModeKeys.TRAIN or mode == tf.contrib.learn.ModeKeys.EVAL:
        loss = tf.losses.sparse_softmax_cross_entropy(target, logits)
        train_op = tf.contrib.layers.optimize_loss(
            loss,
            tf.contrib.framework.get_global_step(),
            optimizer='Adam',
            learning_rate=0.01)
    else:
        loss = None
        train_op = None

    return tflearn.ModelFnOps(
        mode=mode,
        predictions=predictions_dict,
        loss=loss,
        train_op=train_op)
def __init__(self, batch_size, vocabularies, embedding_dim, num_oov_buckets=1000,
             fine_tune_embeddings=False, padded_token=None, name='embed_and_pad'):
    super(EmbedAndPad, self).__init__(name=name)
    self._batch_size = batch_size
    vocab_file, vocab_size = get_merged_vocabulary_file(vocabularies, padded_token)
    self._vocab_size = vocab_size
    self._num_oov_buckets = num_oov_buckets

    # Load vocabulary table for index lookup.
    self._vocabulary_table = contrib_lookup.index_table_from_file(
        vocabulary_file=vocab_file,
        num_oov_buckets=num_oov_buckets,
        vocab_size=self._vocab_size)

    def create_initializer(initializer_range=0.02):
        """Creates a `truncated_normal_initializer` with the given range."""
        # The default value is chosen from language/bert/modeling.py.
        return tf.truncated_normal_initializer(stddev=initializer_range)

    self._embeddings = tf.get_variable(
        'embeddings_matrix',
        [self._vocab_size + num_oov_buckets, embedding_dim],
        trainable=fine_tune_embeddings,
        initializer=create_initializer())
def get_lookup_table(element_file_path, oov_buckets, size=None, device='/cpu:0',
                     name='lookup_table'):
    with tf.device(device):
        return lookup.index_table_from_file(
            vocabulary_file=element_file_path,
            num_oov_buckets=oov_buckets,
            vocab_size=size,
            default_value=-1,  # -1 is always the padding value
            name=name)
def custom_fast_text(features, labels, mode, params):
    vocab_table = lookup.index_table_from_file(vocabulary_file='data/vocab.csv',
                                               num_oov_buckets=1, default_value=-1)
    text = features[commons.FEATURE_COL]
    words = tf.string_split(text)
    dense_words = tf.sparse_tensor_to_dense(words, default_value=commons.PAD_WORD)
    word_ids = vocab_table.lookup(dense_words)

    padding = tf.constant([[0, 0], [0, commons.CNN_MAX_DOCUMENT_LENGTH]])
    # Pad all the word_ids entries to the maximum document length
    word_ids_padded = tf.pad(word_ids, padding)
    word_id_vector = tf.slice(word_ids_padded, [0, 0], [-1, commons.CNN_MAX_DOCUMENT_LENGTH])

    if mode == tf.estimator.ModeKeys.TRAIN:
        tf.keras.backend.set_learning_phase(True)
    else:
        tf.keras.backend.set_learning_phase(False)

    embedded_sequences = tf.keras.layers.Embedding(
        params.N_WORDS, 20, input_length=commons.CNN_MAX_DOCUMENT_LENGTH)(word_id_vector)
    f1 = tf.keras.layers.GlobalMaxPooling1D()(embedded_sequences)
    logits = tf.keras.layers.Dense(commons.TARGET_SIZE, activation=None)(f1)
    predictions = tf.nn.sigmoid(logits)

    if mode == tf.estimator.ModeKeys.PREDICT:
        prediction_dict = {
            'class': tf.cast(
                tf.map_fn(lambda x: tf.cond(x > 0.30, lambda: 1.0, lambda: 0.0),
                          tf.squeeze(predictions)),
                dtype=tf.int32),
        }
        export_outputs = {
            'predictions': tf.estimator.export.PredictOutput(prediction_dict)
        }
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions,
                                          export_outputs=export_outputs)

    loss = tf.losses.sigmoid_cross_entropy(multi_class_labels=labels, logits=logits)
    tf.summary.scalar('loss', loss)

    acc = tf.equal(tf.cast(predictions, dtype=tf.int32), labels)
    acc = tf.reduce_mean(tf.cast(acc, tf.float32))
    tf.summary.scalar('acc', acc)

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdamOptimizer()
        train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, train_op=train_op, loss=loss)

    if mode == tf.estimator.ModeKeys.EVAL:
        eval_metrics_ops = {
            'accuracy': tf.metrics.accuracy(labels=labels, predictions=predictions)
        }
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss,
                                          eval_metric_ops=eval_metrics_ops)
def cnn_model(features, target, mode):
    table = lookup.index_table_from_file(vocabulary_file=WORD_VOCAB_FILE,
                                         num_oov_buckets=1, default_value=-1)

    # string operations
    titles = tf.squeeze(features['title'], [1])
    words = tf.string_split(titles)
    densewords = tf.sparse_tensor_to_dense(words, default_value=PADWORD)
    numbers = table.lookup(densewords)
    padding = tf.constant([[0, 0], [0, MAX_DOCUMENT_LENGTH]])
    padded = tf.pad(numbers, padding)
    sliced = tf.slice(padded, [0, 0], [-1, MAX_DOCUMENT_LENGTH])
    print('words_sliced={}'.format(words))  # (?, 20)

    # layer to take the words and convert them into vectors (embeddings)
    embeds = tf.contrib.layers.embed_sequence(sliced, vocab_size=N_WORDS,
                                              embed_dim=EMBEDDING_SIZE)
    print('words_embed={}'.format(embeds))  # (?, 20, 10)

    # now do convolution
    conv = tf.contrib.layers.conv2d(embeds, 1, WINDOW_SIZE, stride=STRIDE, padding='SAME')  # (?, 4, 1)
    conv = tf.nn.relu(conv)  # (?, 4, 1)
    words = tf.squeeze(conv, [2])  # (?, 4)
    print('words_conv={}'.format(words))  # (?, 4)

    n_classes = len(TARGETS)
    logits = tf.contrib.layers.fully_connected(words, n_classes, activation_fn=None)
    # print('logits={}'.format(logits))  # (?, 3)

    predictions_dict = {
        'source': tf.gather(TARGETS, tf.argmax(logits, 1)),
        'class': tf.argmax(logits, 1),
        'prob': tf.nn.softmax(logits)
    }

    if mode == tf.contrib.learn.ModeKeys.TRAIN or mode == tf.contrib.learn.ModeKeys.EVAL:
        loss = tf.losses.sparse_softmax_cross_entropy(target, logits)
        train_op = tf.contrib.layers.optimize_loss(
            loss,
            tf.contrib.framework.get_global_step(),
            optimizer='Adam',
            learning_rate=0.01)
    else:
        loss = None
        train_op = None

    return tflearn.ModelFnOps(mode=mode,
                              predictions=predictions_dict,
                              loss=loss,
                              train_op=train_op)
def _build(self):
    words, nwords = self.inp  # words: the input text; nwords: the length of each input sequence
    vocab_words = index_table_from_file(
        self.params.vocab,
        num_oov_buckets=self.params.num_oov_buckets)  # build the word-to-id mapping from the vocabulary file
    # Convert words to ids; out-of-vocabulary words are mapped to the current maximum id
    # plus 1, e.g. if the largest id in the vocabulary is 10, every OOV word gets id 11.
    word_ids = vocab_words.lookup(words)

    with tf.variable_scope("embedding"):
        if self.params.use_pretrained:
            glove = np.load(self.params.embed)
            # Append an all-zero row to the bottom of the GloVe matrix as the OOV vector
            glove = np.vstack([glove, [[0.] * self.params.embed_dim]])
            W = tf.Variable(glove, dtype=tf.float32, trainable=True)
        else:
            W = tf.Variable(
                tf.random_uniform([self.params.vocab_size, self.params.embed_dim], -1.0, 1.0),
                name='W', trainable=True)
        embeddings = tf.nn.embedding_lookup(W, word_ids)  # (batch_size, seq_len, embedding_dim)
        embeddings = tf.layers.dropout(embeddings, rate=self.params.dropout,
                                       training=self.training)

    outputs = []  # outputs of the stacked BiLSTM layers
    with tf.variable_scope("BiLSTM"):
        outputs.append(self.BiLSTM(embeddings, nwords))
        for i in range(self.params.lstm_layer - 1):
            outputs.append(self.BiLSTM(outputs[-1], nwords))

    with tf.variable_scope("CRF"):
        self.logits, self.pred_ids, crf_params, self.score = self.CRF(
            outputs[-1], self.num_tags, nwords)

    with tf.variable_scope("output"):
        self.probs = tf.nn.softmax(self.logits, axis=-1)
        best_probs = tf.reduce_max(self.probs, axis=-1)
        self.mnlp_score = tf.reduce_mean(tf.log(best_probs), axis=-1)
        reverse_vocab_tags = tf.contrib.lookup.index_to_string_table_from_tensor(
            self.params.tags)  # reverse table: id -> tag
        self.pred_strings = reverse_vocab_tags.lookup(
            tf.to_int64(self.pred_ids))  # convert the predicted ids back to their tags
        self.weights = tf.sequence_mask(nwords)

    with tf.variable_scope("loss"):
        if self.mode != tf.estimator.ModeKeys.PREDICT:
            vocab_tags = tf.contrib.lookup.index_table_from_tensor(
                self.params.tags)  # lookup table for the tags
            self.tags = vocab_tags.lookup(self.labels)  # convert tags to their ids
            log_likelihood, _ = tf.contrib.crf.crf_log_likelihood(
                self.logits, self.tags, nwords, crf_params)
            self.loss = tf.reduce_mean(-log_likelihood)
            self.train_op = tf.train.AdamOptimizer().minimize(
                self.loss, global_step=tf.train.get_or_create_global_step())
def model_fn(features, labels, mode, params):
    if mode == tf.estimator.ModeKeys.TRAIN:
        tf.keras.backend.set_learning_phase(True)
    else:
        tf.keras.backend.set_learning_phase(False)

    text = features[commons.FEATURE_COL]
    words = tf.string_split(text)
    x = words.values
    split_chars = tf.string_split(x, delimiter='')

    table = lookup.index_table_from_file(vocabulary_file='data/vocab.csv',
                                         vocab_size=69, default_value=0)
    dense_words = tf.sparse_tensor_to_dense(split_chars, default_value='#')
    word_ids = table.lookup(dense_words)

    padding = tf.constant([[0, 0], [0, commons.MAX_DOCUMENT_LENGTH]])
    # Pad all the word_ids entries to the maximum document length
    word_ids_padded = tf.pad(word_ids, padding)
    word_id_vector = tf.slice(word_ids_padded, [0, 0], [-1, commons.MAX_DOCUMENT_LENGTH])

    encoded = tf.one_hot(table.lookup(split_chars.values), commons.MAX_DOCUMENT_LENGTH,
                         dtype=tf.float32)
    encoded = tf.reshape(encoded, [commons.MAX_DOCUMENT_LENGTH, 69])

    f1 = tf.keras.layers.Convolution1D(filters=256, kernel_size=7, padding="valid",
                                       activation='relu')(word_id_vector)
    f1 = tf.keras.layers.MaxPooling1D(pool_size=3)(f1)
    f1 = tf.keras.layers.Flatten()(f1)
    # f1 = tf.keras.layers.Flatten()(f1)

    logits = tf.keras.layers.Dense(commons.TARGET_SIZE, activation=None)(f1)
    predictions = tf.nn.softmax(logits)
    prediction_indices = tf.argmax(predictions, axis=1)

    labels_one_hot = tf.one_hot(labels, depth=4, dtype=tf.int32)
    # loss = tf.losses.softmax_cross_entropy(onehot_labels=labels_one_hot, logits=logits)
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)

    optimizer = tf.train.AdamOptimizer()
    train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())

    print(tf.shape(labels))
    print(tf.shape(prediction_indices))

    eval_metrics_ops = {
        'accuracy': tf.metrics.accuracy(labels=labels, predictions=prediction_indices)
    }
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op,
                                      eval_metric_ops=eval_metrics_ops)
def get_wide_deep():
    MAX_DOCUMENT_LENGTH = 20
    PADWORD = 'xyzpadxyz'
    EMBEDDING_SIZE = 10
    VOCAB_SIZE = 10002

    # Define column types
    subreddit = tf.feature_column.categorical_column_with_vocabulary_list(
        'subreddit', ['news', 'ireland', 'pics'])

    vocab_table = lookup.index_table_from_file(
        vocabulary_file='{}/vocab.csv-00000-of-00001'.format(INPUT_DIR),
        num_oov_buckets=1, vocab_size=None, default_value=-1)

    # i have tried all these and none work
    # comment_words = tf.string_split(tf.get_variable('comment'))
    # comment_words = tf.string_split(['comment'])
    # comment_words = tf.string_split(tf.constant(['comment']))
    # comment_words = tf.string_split([comment])
    # comment_words = tf.string_split('comment')
    # comment = tf.constant(['comment'])
    # comment = tf.constant(dataset['comment'])
    # comment = tf.constant(comment)
    # comment_words = tf.string_split(features["comment"])
    # comment_words = tf.string_split(dataset["comment"])
    # comment = tf.constant(features.get('comment'))
    # comment = tf.constant(features.get('comment'))
    # comment_words = tf.string_split(features.get("comment"))
    # comment_words = tf.string_split(dataset.get("comment"))
    # comment_densewords = tf.sparse_tensor_to_dense(comment_words, default_value=PADWORD)
    # comment_numbers = vocab_table.lookup(comment_densewords)
    # comment_padding = tf.constant([[0, 0], [0, MAX_DOCUMENT_LENGTH]])
    # comment_padded = tf.pad(comment_numbers, comment_padding)
    # comment_sliced = tf.slice(comment_padded, [0, 0], [-1, MAX_DOCUMENT_LENGTH])
    # print('comment_sliced={}'.format(comment_words))  # (?, 20)
    # comment_integerized = tf.contrib.layers.sparse_column_with_integerized_feature(
    #     comment_sliced, bucket_size=VOCAB_SIZE, combiner='sum')
    # comment_bow = tf.one_hot(comment_sliced)
    # comment_embeds = tf.contrib.layers.embedding_column(comment_integerized, dimension=EMBEDDING_SIZE)
    # print('comment_embeds={}'.format(comment_embeds))  # (?, 20, 10)

    # Sparse columns are wide, have a linear relationship with the output
    wide = [subreddit]
    # Continuous columns are deep, have a complex relationship with the output
    deep = []
    return wide, deep
def input_fn(params):
    """The actual input function."""
    batch_size = params["batch_size"]

    vocab_table = contrib_lookup.index_table_from_file(FLAGS.vocab_file)

    if len(expanded_files) == 1:
        d = tf.data.TFRecordDataset(expanded_files[0])
        if is_training:
            d = d.repeat()
            d = d.shuffle(buffer_size=256)
    else:
        dataset_list = [
            tf.data.TFRecordDataset(expanded_files[i])
            for i in range(len(expanded_files))
        ]
        if is_training:
            dataset_list = [d.repeat() for d in dataset_list]

        dset_weights = [FLAGS.dataset_one_weight, 1 - FLAGS.dataset_one_weight]
        if FLAGS.dataset_two_weight != 0:
            dset_weights = [
                FLAGS.dataset_one_weight,
                FLAGS.dataset_two_weight,
                1 - FLAGS.dataset_one_weight + FLAGS.dataset_two_weight
            ]
        d = tf.data.experimental.sample_from_datasets(dataset_list, dset_weights)

        # Note that sample_from_datasets() inserts randomness into the training.
        # An alternative would be to use choose_from_datasets(), but then the
        # order must be stated explicitly, which is less intuitive for unbalanced
        # datasets. Example below:
        #
        #   choice_dataset = tf.data.Dataset.range(len(dataset_list)).repeat()
        #   d = tf.data.experimental.choose_from_datasets(dataset_list, choice_dataset)

        if is_training:
            d = d.shuffle(buffer_size=256)

    # The window size is used for selecting negative samples.
    # It equals the number of documents to sample from, minus one.
    d = d.apply(
        contrib_data.sliding_window_batch(
            window_size=FLAGS.data_window_size,
            window_shift=FLAGS.data_window_shift))

    d = d.apply(
        tf.data.experimental.map_and_batch(
            lambda record: _decode_record(record, name_to_features, vocab_table),
            batch_size=batch_size,
            drop_remainder=drop_remainder))
    return d
def load_vocab(infilename):
    v = arguments.pop('vocab', None)
    if v is None:
        return
    print("Loading Vocabulary {0}".format(v))

    table = lookup.index_table_from_file(vocabulary_file=infilename,
                                         num_oov_buckets=1,
                                         vocab_size=None,
                                         default_value=-1)
    numbers = table.lookup(tf.constant('quick fox the not blah blah'.split()))

    with tf.Session() as sess:
        tf.tables_initializer().run()
        print("{} --> {}".format(LINES[0], numbers.eval()))
def test_stale_asset_collections_are_cleaned(self):
    vocabulary_file = os.path.join(compat.as_bytes(test.get_temp_dir()),
                                   compat.as_bytes('asset'))
    file_io.write_string_to_file(vocabulary_file, 'foo bar baz')

    export_path = os.path.join(tempfile.mkdtemp(), 'export')

    # create a SavedModel including assets
    with tf.Graph().as_default():
        with tf.Session().as_default() as session:
            input_string = tf.placeholder(tf.string)
            # Map string through a table loaded from an asset file
            table = lookup.index_table_from_file(vocabulary_file,
                                                 num_oov_buckets=12,
                                                 default_value=12)
            output = table.lookup(input_string)
            inputs = {'input': input_string}
            outputs = {'output': output}
            saved_transform_io.write_saved_transform_from_session(
                session, inputs, outputs, export_path)

    # Load it and save it again repeatedly, verifying that the asset collections
    # remain valid.
    for _ in [1, 2, 3]:
        with tf.Graph().as_default() as g:
            with tf.Session().as_default() as session:
                input_string = tf.constant('dog')
                inputs = {'input': input_string}
                _, outputs = (
                    saved_transform_io.partially_apply_saved_transform_internal(
                        export_path, inputs))

                self.assertEqual(
                    1, len(g.get_collection(ops.GraphKeys.ASSET_FILEPATHS)))
                self.assertEqual(
                    0, len(g.get_collection(tf.saved_model.constants.ASSETS_KEY)))

                # Check that every ASSET_FILEPATHS refers to a Tensor in the graph.
                # If not, get_tensor_by_name() raises KeyError.
                for asset_path in g.get_collection(ops.GraphKeys.ASSET_FILEPATHS):
                    tensor_name = asset_path.name
                    g.get_tensor_by_name(tensor_name)

                export_path = os.path.join(tempfile.mkdtemp(), 'export')
                saved_transform_io.write_saved_transform_from_session(
                    session, inputs, outputs, export_path)
def train_input_fn():
    def file_len(fname):
        with open(fname) as f:
            for i, l in enumerate(f):
                pass
        return i + 1

    filename = "train.csv"
    filename_queue = tf.train.string_input_producer([filename])
    reader = tf.TextLineReader()
    _, value = reader.read(filename_queue)

    record_defaults = DEFAULTS
    col1, col2 = tf.decode_csv(value, record_defaults=record_defaults, field_delim='|')
    label = tf.stack([col1])
    features = tf.stack([col2])

    table = tf.contrib.lookup.index_table_from_tensor(
        mapping=tf.constant(TARGETS), num_oov_buckets=0, default_value=-1)
    labels = table.lookup(label)

    table2 = lookup.index_table_from_file(vocabulary_file='vocab.tsv',
                                          num_oov_buckets=1,
                                          vocab_size=None,
                                          default_value=-1)

    # look strings up in the vocabulary
    words = tf.string_split(features)
    densewords = tf.sparse_tensor_to_dense(words, default_value=PADWORD)
    numbers = table2.lookup(densewords)

    # pads vectors out to MAX_DOCUMENT_LENGTH
    padding = tf.constant([[0, 0], [0, MAX_DOCUMENT_LENGTH]])
    padded = tf.pad(numbers, padding)
    sliced = tf.slice(padded, [0, 0], [-1, MAX_DOCUMENT_LENGTH])
    shaped = tf.reshape(sliced, [1735])

    batch_size = file_len(filename)
    min_after_dequeue = 10000
    capacity = min_after_dequeue + 3 * batch_size
    features, labels = tf.train.shuffle_batch(
        [shaped, labels],
        batch_size=batch_size,
        capacity=capacity,
        min_after_dequeue=min_after_dequeue)
    return features, labels
def get_embedding(hparams, titles, embed_size):
    table = lookup.index_table_from_file(vocabulary_file=WORD_VOCAB_FILE,
                                         num_oov_buckets=1, default_value=-1)

    # string operations
    words = tf.string_split(titles)
    densewords = tf.sparse_tensor_to_dense(words, default_value=PADWORD)
    numbers = table.lookup(densewords)
    padding = tf.constant([[0, 0], [0, MAX_DOCUMENT_LENGTH]])
    padded = tf.pad(numbers, padding)
    sliced = tf.slice(padded, [0, 0], [-1, MAX_DOCUMENT_LENGTH])
    # print('words_sliced={}'.format(words))  # (?, 20)

    # layer to take the words and convert them into vectors (embeddings)
    embeds = tf.contrib.layers.embed_sequence(sliced, vocab_size=N_WORDS,
                                              embed_dim=embed_size)
    # print('words_embed={}'.format(embeds))  # (?, 20, 10)
    return embeds
def model_fn(features, labels, mode, params):
    if mode == tf.estimator.ModeKeys.TRAIN:
        tf.keras.backend.set_learning_phase(True)
    else:
        tf.keras.backend.set_learning_phase(False)

    vocab_table = lookup.index_table_from_file(vocabulary_file='data/vocab.csv',
                                               num_oov_buckets=1, default_value=-1)
    text = features[commons.FEATURE_COL]
    words = tf.string_split(text)
    dense_words = tf.sparse_tensor_to_dense(words, default_value=commons.PAD_WORD)
    word_ids = vocab_table.lookup(dense_words)

    padding = tf.constant([[0, 0], [0, commons.MAX_DOCUMENT_LENGTH]])
    # Pad all the word_ids entries to the maximum document length
    word_ids_padded = tf.pad(word_ids, padding)
    word_id_vector = tf.slice(word_ids_padded, [0, 0], [-1, commons.MAX_DOCUMENT_LENGTH])

    f1 = tf.keras.layers.Embedding(params.N_WORDS, 100,
                                   input_length=commons.MAX_DOCUMENT_LENGTH)(word_id_vector)
    f2 = tf.keras.layers.Embedding(params.N_WORDS, 200,
                                   input_length=commons.MAX_DOCUMENT_LENGTH)(word_id_vector)
    f3 = tf.keras.layers.Embedding(params.N_WORDS, 300,
                                   input_length=commons.MAX_DOCUMENT_LENGTH)(word_id_vector)

    filter_sizes = [3, 5]
    conv_pools = []
    for text_embedding in [f1, f2, f3]:
        for filter_size in filter_sizes:
            l_zero = tf.keras.layers.ZeroPadding1D((filter_size - 1, filter_size - 1))(text_embedding)
            l_conv = tf.keras.layers.Conv1D(filters=32, kernel_size=filter_size,
                                            padding='same', activation='tanh')(l_zero)
            l_pool = tf.keras.layers.GlobalMaxPool1D()(l_conv)
            conv_pools.append(l_pool)

    merged = tf.keras.layers.Concatenate(axis=1)(conv_pools)
    dense1 = tf.keras.layers.Dense(128, activation='relu',
                                   kernel_regularizer=tf.keras.regularizers.l2(0.01))(merged)
    dense2 = tf.keras.layers.Dense(64, activation='relu',
                                   kernel_regularizer=tf.keras.regularizers.l2(0.01))(dense1)
    logits = tf.keras.layers.Dense(1, activation=None)(dense2)

    if labels is not None:
        labels = tf.reshape(labels, [-1, 1])

    optimizer = tf.train.AdamOptimizer()

    def _train_op_fn(loss):
        return optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())

    return head.create_estimator_spec(features=features, labels=labels, mode=mode,
                                      logits=logits, train_op_fn=_train_op_fn)
def serving_fn():
    input_string = tf.placeholder(dtype=tf.string, shape=None)
    receiver_tensor = {'sms_input': input_string}

    # word_id_vector = tf.map_fn(fn=map_serving, elems=input_string)
    vocab_table = lookup.index_table_from_file(vocabulary_file='vocab.csv',
                                               num_oov_buckets=1, default_value=-1)
    words = tf.string_split(input_string)
    dense_words = tf.sparse_tensor_to_dense(words, default_value=commons.PAD_WORD)
    word_ids = vocab_table.lookup(dense_words)

    padding = tf.constant([[0, 0], [0, max_words]])
    # Pad all the word_ids entries to the maximum document length
    word_ids_padded = tf.pad(word_ids, padding)
    word_id_vector = tf.slice(word_ids_padded, [0, 0], [-1, max_words])

    features = {'sms_input': word_id_vector}
    return tf.estimator.export.ServingInputReceiver(features, receiver_tensor)
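# Usage sketch (an assumption, not from the original snippet): how a serving function
# like this is typically wired into a TF 1.x Estimator export. `model_fn` stands for
# one of the model functions shown elsewhere in these examples, and 'model_dir' /
# 'exported_model' are hypothetical paths.
import tensorflow as tf

estimator = tf.estimator.Estimator(model_fn=model_fn, model_dir='model_dir')
estimator.export_savedmodel(export_dir_base='exported_model',
                            serving_input_receiver_fn=serving_fn)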
def model_fn(features, labels, mode, params):
    if mode == tf.estimator.ModeKeys.TRAIN:
        tf.keras.backend.set_learning_phase(True)
    else:
        tf.keras.backend.set_learning_phase(False)

    vocab_table = lookup.index_table_from_file(vocabulary_file='data/vocab.csv',
                                               num_oov_buckets=1, default_value=-1)
    text = features[commons.FEATURE_COL]
    words = tf.string_split(text)
    dense_words = tf.sparse_tensor_to_dense(words, default_value=commons.PAD_WORD)
    word_ids = vocab_table.lookup(dense_words)

    padding = tf.constant([[0, 0], [0, commons.MAX_DOCUMENT_LENGTH]])
    # Pad all the word_ids entries to the maximum document length
    word_ids_padded = tf.pad(word_ids, padding)
    word_id_vector = tf.slice(word_ids_padded, [0, 0], [-1, commons.MAX_DOCUMENT_LENGTH])

    f1 = tf.keras.layers.Embedding(params.N_WORDS, 50,
                                   input_length=commons.MAX_DOCUMENT_LENGTH)(word_id_vector)
    f1 = tf.keras.layers.Dropout(0.2)(f1)
    f1 = tf.keras.layers.ZeroPadding1D((49, 49))(f1)
    f1 = tf.keras.layers.Conv1D(64, 50, padding='same', activation=None, strides=1)(f1)
    f1 = KMaxPooling(k=9, axis=1)(f1)
    f1 = tf.keras.layers.ZeroPadding1D((24, 24))(f1)
    f1 = tf.keras.layers.Conv1D(64, 25, padding='same', activation=None, strides=1)(f1)
    f1 = Folding()(f1)
    f1 = KMaxPooling(k=9, axis=1)(f1)
    f1 = tf.keras.layers.Activation('relu')(f1)
    f1 = tf.keras.layers.Flatten()(f1)
    f1 = tf.keras.layers.Dropout(0.2)(f1)
    logits = tf.keras.layers.Dense(1, activation=None)(f1)

    if labels is not None:
        labels = tf.reshape(labels, [-1, 1])

    optimizer = tf.train.AdamOptimizer()

    def _train_op_fn(loss):
        return optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())

    return head.create_estimator_spec(features=features, labels=labels, mode=mode,
                                      logits=logits, train_op_fn=_train_op_fn)
def input_fn(params):
    """The actual input function."""
    batch_size = params["batch_size"]

    # For training, we want a lot of parallel reading and shuffling.
    # For eval, we want no shuffling and parallel reading doesn't matter.
    vocab_table = contrib_lookup.index_table_from_file(FLAGS.vocab_file)

    d = tf.data.TFRecordDataset(input_file)
    if is_training:
        d = d.repeat()
        d = d.shuffle(buffer_size=256)

    d = d.apply(
        tf.data.experimental.map_and_batch(
            lambda record: _decode_record(record, name_to_features, vocab_table),
            batch_size=batch_size,
            drop_remainder=drop_remainder))
    return d
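# Sketch of how an input function with this signature is usually driven (an assumption,
# not from the original snippet): tf.estimator.Estimator forwards its `params` dict to
# input_fn because input_fn declares a `params` argument. `model_fn` is a placeholder
# for one of the model functions defined in these examples.
import tensorflow as tf

estimator = tf.estimator.Estimator(model_fn=model_fn, params={"batch_size": 32})
estimator.train(input_fn=input_fn, max_steps=1000)  # input_fn receives the params dict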
def linear_model(features, target, mode):
    # make input features numeric
    from tensorflow.contrib import lookup
    table = lookup.index_table_from_file(vocabulary_file=WORD_VOCAB_FILE,
                                         num_oov_buckets=1,
                                         vocab_size=N_WORDS,
                                         default_value=-1,
                                         name="word_to_index")
    word_indexes = table.lookup(features['title'])

    word_vectors = tf.contrib.layers.embed_sequence(word_indexes,
                                                    vocab_size=(N_WORDS + 1),
                                                    embed_dim=EMBEDDING_SIZE,
                                                    scope='words')

    n_classes = len(TARGETS)
    logits = tf.contrib.layers.fully_connected(word_vectors, n_classes, activation_fn=None)
    logits = tf.squeeze(logits, squeeze_dims=[1])  # from (?, 1, 3) to (?, 3)

    predictions_dict = {
        'source': tf.gather(TARGETS, tf.argmax(logits, 1)),
        'class': tf.argmax(logits, 1),
        'prob': tf.nn.softmax(logits)
    }

    if mode == tf.contrib.learn.ModeKeys.TRAIN or mode == tf.contrib.learn.ModeKeys.EVAL:
        loss = tf.losses.sparse_softmax_cross_entropy(target, logits)
        train_op = tf.contrib.layers.optimize_loss(
            loss,
            tf.contrib.framework.get_global_step(),
            optimizer='Adam',
            learning_rate=0.01)
    else:
        loss = None
        train_op = None

    return tflearn.ModelFnOps(mode=mode,
                              predictions=predictions_dict,
                              loss=loss,
                              train_op=train_op)
def cnn_model(features, labels, mode):
    # convert vocab to numbers
    table = lookup.index_table_from_file(vocabulary_file='vocab.tsv',
                                         num_oov_buckets=1,
                                         vocab_size=None,
                                         default_value=-1)

    # Looks up specific terms 'Some title'
    # numbers = table.lookup(tf.constant('Some title'.split()))
    # with tf.Session() as sess:
    #     tf.tables_initializer().run()
    #     print("{} --> {}".format(lines[0], numbers.eval()))

    # create sparse vectors, convert to dense and look vectors up in the dictionary
    # titles = tf.squeeze(features['Review Text'], [1])
    words = tf.string_split(features)
    densewords = tf.sparse_tensor_to_dense(words, default_value=PADWORD)
    numbers = table.lookup(densewords)

    # Shows dense word vectors
    # sess = tf.Session()
    # sess.run(densewords)

    # Shows vectors of words where dictionary is applied
    # table.init.run(session=sess)
    # print(numbers.eval(session=sess))

    # pads vectors out to MAX_DOCUMENT_LENGTH
    padding = tf.constant([[0, 0], [0, MAX_DOCUMENT_LENGTH]])
    padded = tf.pad(numbers, padding)
    sliced = tf.slice(padded, [0, 0], [-1, MAX_DOCUMENT_LENGTH])
    # sess.run(sliced)

    # create embeddings
    embeds = tf.contrib.layers.embed_sequence(sliced, vocab_size=N_WORDS,
                                              embed_dim=EMBEDDING_SIZE)
    # print('words_embed={}'.format(embeds))  # (?, 20, 10)

    # Convolutions!!!
    conv = tf.contrib.layers.conv2d(embeds, 1, WINDOW_SIZE, stride=STRIDE, padding='SAME')  # (?, 4, 1)
    conv = tf.nn.relu(conv)  # (?, 4, 1)
    words = tf.squeeze(conv, [2])  # (?, 4)

    logits = tf.contrib.layers.fully_connected(words, n_classes, activation_fn=None)

    correctPred = tf.equal(tf.argmax(logits, 1), labels)
    accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float64))
    tf.summary.scalar('Accuracy', accuracy)
    merged = tf.summary.merge_all()

    if mode == tf.contrib.learn.ModeKeys.TRAIN or mode == tf.contrib.learn.ModeKeys.EVAL:
        loss = tf.losses.sparse_softmax_cross_entropy(labels, logits)
        train_op = tf.contrib.layers.optimize_loss(
            loss,
            tf.contrib.framework.get_global_step(),
            optimizer='Adam',
            learning_rate=0.01)
    else:
        loss = None
        train_op = None

    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
def model_fn(features, labels, mode, params):
    if mode == tf.estimator.ModeKeys.TRAIN:
        tf.keras.backend.set_learning_phase(True)
    else:
        tf.keras.backend.set_learning_phase(False)

    vocab_table = lookup.index_table_from_file(vocabulary_file='data/vocab.csv',
                                               num_oov_buckets=1, default_value=-1)
    text = features[commons.FEATURE_COL]
    words = tf.string_split(text)
    dense_words = tf.sparse_tensor_to_dense(words, default_value=commons.PAD_WORD)
    word_ids = vocab_table.lookup(dense_words)

    padding = tf.constant([[0, 0], [0, commons.MAX_DOCUMENT_LENGTH]])
    # Pad all the word_ids entries to the maximum document length
    word_ids_padded = tf.pad(word_ids, padding)
    word_id_vector = tf.slice(word_ids_padded, [0, 0], [-1, commons.MAX_DOCUMENT_LENGTH])

    word_embeddings = layers.embed_sequence(word_id_vector, vocab_size=params.N_WORDS,
                                            embed_dim=50)
    min_vectors = tf.reduce_min(word_embeddings, axis=1)
    max_vectors = tf.reduce_max(word_embeddings, axis=1)
    min_max_vectors = tf.concat([min_vectors, max_vectors], axis=1)

    d1 = tf.keras.layers.Dense(25, activation='relu')(min_max_vectors)
    logits = tf.keras.layers.Dense(commons.TARGET_SIZE)(d1)

    probabilities = tf.nn.softmax(logits)
    predicted_indices = tf.argmax(probabilities, axis=1)

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'class': tf.gather(commons.TARGET_LABELS, predicted_indices),
            'probabilities': probabilities
        }
        exported_outputs = {
            'prediction': tf.estimator.export.PredictOutput(predictions)
        }
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions,
                                          export_outputs=exported_outputs)

    weights = features[commons.WEIGHT_COLUNM_NAME]
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits,
                                                  weights=weights)
    tf.summary.scalar('loss', loss)

    acc = tf.equal(predicted_indices, labels)
    acc = tf.reduce_mean(tf.cast(acc, tf.float32))
    tf.summary.scalar('acc', acc)

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdamOptimizer()
        train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

    if mode == tf.estimator.ModeKeys.EVAL:
        eval_metrics_ops = {
            'accuracy': tf.metrics.accuracy(labels=labels, predictions=predicted_indices,
                                            weights=weights),
            'precision': tf.metrics.precision(labels=labels, predictions=predicted_indices,
                                              weights=weights),
            'recall': tf.metrics.recall(labels=labels, predictions=predicted_indices,
                                        weights=weights),
            'f1_score': streaming_f1(labels=labels, predictions=predicted_indices)
        }
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss,
                                          eval_metric_ops=eval_metrics_ops)
def fast_text_model_fn(self, features, labels, mode, params):
    vocab_table = lookup.index_table_from_file(vocabulary_file=self.VOCAB_FILE,
                                               num_oov_buckets=1, default_value=-1)
    text = features[self.FEATURE_COL]
    words = tf.string_split(text)
    dense_words = tf.sparse_tensor_to_dense(words, default_value=self.PAD_WORD)
    word_ids = vocab_table.lookup(dense_words)

    padding = tf.constant([[0, 0], [0, self.MAX_LEN]])
    # Pad all the word_ids entries to the maximum document length
    word_ids_padded = tf.pad(word_ids, padding)
    word_id_vector = tf.slice(word_ids_padded, [0, 0], [-1, self.MAX_LEN])

    if mode == tf.estimator.ModeKeys.TRAIN:
        tf.keras.backend.set_learning_phase(True)
    else:
        tf.keras.backend.set_learning_phase(False)

    with tf.name_scope('embedding'):
        embedding_vectors = layers.embed_sequence(word_id_vector,
                                                  vocab_size=self.VOCAB_LEN,
                                                  embed_dim=self.EMBED_DIM,
                                                  initializer=layers.xavier_initializer(seed=42))
        tf.logging.info('Word Vectors = {}'.format(embedding_vectors))

    with tf.name_scope('fast_text'):
        average_vectors = tf.reduce_sum(embedding_vectors, axis=1)
        tf.logging.info('Average Word Vectors = {}'.format(average_vectors))

    with tf.name_scope('hidden_layer'):
        fc1 = tf.keras.layers.Dense(1024, activation='relu')(average_vectors)
        d1 = tf.keras.layers.Dropout(0.5)(fc1)
        fc2 = tf.keras.layers.Dense(self.EMBED_DIM / 2, activation='relu')(d1)
        d2 = tf.keras.layers.Dropout(0.5)(fc2)
        tf.logging.info('Hidden Layer = {}'.format(d2))

    with tf.name_scope('output'):
        logits = tf.keras.layers.Dense(self.TARGET_SIZE, activation=None)(d2)
        tf.logging.info('Logits Layer = {}'.format(logits))

    probabilities = tf.nn.softmax(logits)
    predicted_indices = tf.argmax(probabilities, axis=1)

    tf.summary.histogram('fasttext', average_vectors)
    tf.summary.histogram('softmax', probabilities)

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'class': predicted_indices,
            'probabilities': probabilities
        }
        exported_outputs = {
            'prediction': tf.estimator.export.PredictOutput(predictions)
        }
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions,
                                          export_outputs=exported_outputs)

    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
    tf.summary.scalar('loss', loss)

    acc = tf.equal(predicted_indices, labels)
    acc = tf.reduce_mean(tf.cast(acc, tf.float32))
    tf.summary.scalar('acc', acc)

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdamOptimizer()
        train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

    if mode == tf.estimator.ModeKeys.EVAL:
        eval_metrics_ops = {
            'accuracy': tf.metrics.accuracy(labels=labels, predictions=predicted_indices),
            'precision': tf.metrics.precision(labels=labels, predictions=predicted_indices),
            'recall': tf.metrics.recall(labels=labels, predictions=predicted_indices),
            'f1_score': self.streaming_f1(labels=labels, predictions=predicted_indices,
                                          n_classes=self.TARGET_SIZE)
        }
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss,
                                          eval_metric_ops=eval_metrics_ops)
def main():
    input = [["emersoN", "lAke", "aNd", "palmer"],
             ["i", "haVe", "a", "343yaCht123", "m%an", "2543"]]
    sentences_padded, _ = pad_sequences(input, '')
    sentences = tf.constant(sentences_padded)
    lowercase_sentences = lowercase(sentences)

    table = lookup.index_table_from_tensor(mapping=tf.constant(['']), default_value=1)
    sequence_lengths = tf.reduce_sum(table.lookup(sentences), 1)

    word_table = lookup.index_table_from_file(vocabulary_file="data/words.txt",
                                              num_oov_buckets=1)
    char_table = lookup.index_table_from_file(vocabulary_file="data/chars.txt",
                                              default_value=-1)

    sentences_shape = tf.shape(sentences, out_type=tf.int64)

    # We need to remove chars not in vocab
    removed_char_sentences = remove_unknown_chars(sentences, char_table)

    split_words = tf.string_split(tf.reshape(removed_char_sentences, [-1]), delimiter="")
    dense_split_words = tf.sparse_tensor_to_dense(split_words, default_value='')

    max_word_len = tf.gather_nd(split_words.dense_shape, [1])
    chars_shape = tf.concat([sentences_shape, [max_word_len]], 0)
    chars = tf.reshape(dense_split_words, chars_shape)
    word_lengths = tf.reduce_sum(table.lookup(chars), 2)

    word_ids = word_table.lookup(sentences)
    char_ids = char_table.lookup(chars)

    word_mask = tf.sequence_mask(sequence_lengths)
    word_ids = tf.where(word_mask, word_ids, tf.zeros_like(word_ids))

    char_mask = tf.sequence_mask(word_lengths)
    char_ids = tf.where(char_mask, char_ids, tf.zeros_like(char_ids))

    config = Config()

    # build model
    model = NERModel(config)
    model.build()

    dev = CoNLLDataset(config.filename_dev, max_iter=config.max_iter)
    train = CoNLLDataset(config.filename_train, max_iter=config.max_iter)

    batch_size = model.config.batch_size

    # iterate over dataset
    for i, (words, labels) in enumerate(minibatches(train, batch_size)):
        print("Start")
        fd, _ = model.get_feed_dict(words, labels, model.config.lr, model.config.dropout)
        _, train_loss = model.sess.run([model.train_op, model.loss], feed_dict=fd)
        print("train loss", train_loss)

        metrics = model.run_evaluate(dev)
        msg = " - ".join(["{} {:04.2f}".format(k, v) for k, v in metrics.items()])
        print(msg)
def model_fn(features, labels, mode, params):
    if mode == tf.estimator.ModeKeys.TRAIN:
        tf.keras.backend.set_learning_phase(True)
    else:
        tf.keras.backend.set_learning_phase(False)

    vocab_table = lookup.index_table_from_file(vocabulary_file='data/vocab.csv',
                                               num_oov_buckets=1, default_value=-1)
    text = features[commons.FEATURE_COL]
    words = tf.string_split(text)
    dense_words = tf.sparse_tensor_to_dense(words, default_value=commons.PAD_WORD)
    word_ids = vocab_table.lookup(dense_words)

    padding = tf.constant([[0, 0], [0, commons.MAX_DOCUMENT_LENGTH]])
    # Pad all the word_ids entries to the maximum document length
    word_ids_padded = tf.pad(word_ids, padding)
    word_id_vector = tf.slice(word_ids_padded, [0, 0], [-1, commons.MAX_DOCUMENT_LENGTH])

    f1 = tf.keras.layers.Embedding(params.N_WORDS, 100,
                                   input_length=commons.MAX_DOCUMENT_LENGTH)(word_id_vector)
    f2 = tf.keras.layers.Embedding(params.N_WORDS, 200,
                                   input_length=commons.MAX_DOCUMENT_LENGTH)(word_id_vector)
    f3 = tf.keras.layers.Embedding(params.N_WORDS, 300,
                                   input_length=commons.MAX_DOCUMENT_LENGTH)(word_id_vector)

    filter_sizes = [3, 5]
    conv_pools = []
    for text_embedding in [f1, f2, f3]:
        for filter_size in filter_sizes:
            l_zero = tf.keras.layers.ZeroPadding1D((filter_size - 1, filter_size - 1))(text_embedding)
            l_conv = tf.keras.layers.Conv1D(filters=32, kernel_size=filter_size,
                                            padding='same', activation='tanh')(l_zero)
            l_pool = tf.keras.layers.GlobalMaxPool1D()(l_conv)
            conv_pools.append(l_pool)

    merged = tf.keras.layers.Concatenate(axis=1)(conv_pools)
    dense1 = tf.keras.layers.Dense(128, activation='relu',
                                   kernel_regularizer=tf.keras.regularizers.l2(0.01))(merged)
    dense2 = tf.keras.layers.Dense(64, activation='relu',
                                   kernel_regularizer=tf.keras.regularizers.l2(0.01))(dense1)
    logits = tf.keras.layers.Dense(2, activation=None)(dense2)

    predictions = tf.nn.softmax(logits)
    prediction_indices = tf.argmax(predictions, axis=1)

    if mode == tf.estimator.ModeKeys.PREDICT:
        prediction_dict = {
            'class': prediction_indices,  # tf.gather(commons.TARGET_LABELS, prediction_indices),
            'class_index': prediction_indices,
            'probabilities': predictions
        }
        export_outputs = {
            'predictions': tf.estimator.export.PredictOutput(prediction_dict)
        }
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions,
                                          export_outputs=export_outputs)

    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
    tf.summary.scalar('loss', loss)

    acc = tf.equal(tf.cast(prediction_indices, dtype=tf.int64),
                   tf.cast(labels, dtype=tf.int64))
    acc = tf.reduce_mean(tf.cast(acc, tf.float32))
    tf.summary.scalar('acc', acc)

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdamOptimizer()
        train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, train_op=train_op, loss=loss)

    if mode == tf.estimator.ModeKeys.EVAL:
        eval_metrics_ops = {
            'accuracy': tf.metrics.accuracy(labels=labels, predictions=prediction_indices)
        }
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss,
                                          eval_metric_ops=eval_metrics_ops)
reader = tf.TextLineReader()
_, value = reader.read(filename_queue)

record_defaults = DEFAULTS
col1, col2 = tf.decode_csv(value, record_defaults=record_defaults, field_delim='|')
label = tf.stack([col1])
features = tf.stack([col2])

table = tf.contrib.lookup.index_table_from_tensor(mapping=tf.constant(TARGETS),
                                                  num_oov_buckets=0, default_value=-1)
labels = table.lookup(label)

table2 = lookup.index_table_from_file(vocabulary_file='vocab.tsv',
                                      num_oov_buckets=1,
                                      vocab_size=None,
                                      default_value=-1)

# look strings up in the vocabulary
words = tf.string_split(features)
densewords = tf.sparse_tensor_to_dense(words, default_value=PADWORD)
numbers = table2.lookup(densewords)

# pads vectors out to MAX_DOCUMENT_LENGTH
padding = tf.constant([[0, 0], [0, MAX_DOCUMENT_LENGTH]])
padded = tf.pad(numbers, padding)
sliced = tf.slice(padded, [0, 0], [-1, MAX_DOCUMENT_LENGTH])
shaped = tf.reshape(sliced, [MAX_DOCUMENT_LENGTH])
# shaped = tf.expand_dims(shaped, -1)
# batch_size = file_len(filename)
def model_fn(features, labels, mode, params):
    '''
    CNN model based on Yoon Kim https://arxiv.org/pdf/1408.5882.pdf
    :param features:
    :param labels:
    :param mode:
    :param params:
    :return:
    '''
    vocab_table = lookup.index_table_from_file(vocabulary_file='dataset/vocab.csv',
                                               num_oov_buckets=1, default_value=-1)
    text = features[commons.FEATURE_COL]
    words = tf.string_split(text)
    dense_words = tf.sparse_tensor_to_dense(words, default_value=commons.PAD_WORD)
    word_ids = vocab_table.lookup(dense_words)

    padding = tf.constant([[0, 0], [0, commons.MAX_DOCUMENT_LENGTH]])
    # Pad all the word_ids entries to the maximum document length
    word_ids_padded = tf.pad(word_ids, padding)
    word_id_vector = tf.slice(word_ids_padded, [0, 0], [-1, commons.MAX_DOCUMENT_LENGTH])

    if mode == tf.estimator.ModeKeys.TRAIN:
        tf.keras.backend.set_learning_phase(True)
    else:
        tf.keras.backend.set_learning_phase(False)

    embedded_sequences = tf.keras.layers.Embedding(
        params.N_WORDS, 128, input_length=commons.MAX_DOCUMENT_LENGTH)(word_id_vector)

    conv_layer = []
    for filter_size in filter_sizes:
        l_conv = tf.keras.layers.Conv1D(filters=128, kernel_size=filter_size,
                                        activation='relu')(embedded_sequences)
        l_pool = tf.keras.layers.MaxPooling1D(pool_size=3)(l_conv)
        conv_layer.append(l_pool)

    l_merge = tf.keras.layers.concatenate(conv_layer, axis=1)
    conv = tf.keras.layers.Conv1D(filters=128, kernel_size=3, activation='relu')(l_merge)
    pool = tf.keras.layers.MaxPooling1D(pool_size=3)(conv)

    f1 = tf.keras.layers.Dropout(0.5)(pool)
    f1 = tf.keras.layers.Flatten()(f1)
    f1 = tf.keras.layers.Dense(128, activation='relu')(f1)
    logits = tf.keras.layers.Dense(commons.TARGET_SIZE, activation=None)(f1)

    predictions = tf.nn.softmax(logits)
    prediction_indices = tf.argmax(predictions, axis=1)

    if mode == tf.estimator.ModeKeys.PREDICT:
        prediction_dict = {
            'class': prediction_indices,
            'probabilities': predictions
        }
        export_outputs = {
            'predictions': tf.estimator.export.PredictOutput(prediction_dict)
        }
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions,
                                          export_outputs=export_outputs)

    weights = features[commons.WEIGHT_COLUNM_NAME]
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits,
                                                  weights=weights)
    tf.summary.scalar('loss', loss)

    acc = tf.equal(prediction_indices, labels)
    acc = tf.reduce_mean(tf.cast(acc, tf.float32))
    tf.summary.scalar('acc', acc)

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdamOptimizer()
        train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, train_op=train_op, loss=loss)

    if mode == tf.estimator.ModeKeys.EVAL:
        eval_metrics_ops = {
            'accuracy': tf.metrics.accuracy(labels=labels, predictions=prediction_indices,
                                            weights=weights),
            'precision': tf.metrics.precision(labels=labels, predictions=prediction_indices,
                                              weights=weights),
            'recall': tf.metrics.recall(labels=labels, predictions=prediction_indices,
                                        weights=weights),
            'f1_score': streaming_f1(labels=labels, predictions=prediction_indices,
                                     n_classes=commons.TARGET_SIZE, weights=None)
        }
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss,
                                          eval_metric_ops=eval_metrics_ops)
def model_fn(features, labels, mode, params):
    '''
    CNN model based on Yoon Kim https://arxiv.org/pdf/1408.5882.pdf
    :param features:
    :param labels:
    :param mode:
    :param params:
    :return:
    '''
    vocab_table = lookup.index_table_from_file(vocabulary_file='dataset/vocab.csv',
                                               num_oov_buckets=1, default_value=-1)
    text = features[commons.FEATURE_COL]
    words = tf.string_split(text)
    dense_words = tf.sparse_tensor_to_dense(words, default_value=commons.PAD_WORD)
    word_ids = vocab_table.lookup(dense_words)

    padding = tf.constant([[0, 0], [0, commons.MAX_DOCUMENT_LENGTH]])
    # Pad all the word_ids entries to the maximum document length
    word_ids_padded = tf.pad(word_ids, padding)
    word_id_vector = tf.slice(word_ids_padded, [0, 0], [-1, commons.MAX_DOCUMENT_LENGTH])

    if mode == tf.estimator.ModeKeys.TRAIN:
        tf.keras.backend.set_learning_phase(True)
    else:
        tf.keras.backend.set_learning_phase(False)

    embedded_sequences = tf.keras.layers.Embedding(
        params.N_WORDS, 128, input_length=commons.MAX_DOCUMENT_LENGTH)(word_id_vector)

    conv_layer = []
    for filter_size in filter_sizes:
        l_conv = tf.keras.layers.Conv1D(filters=128, kernel_size=filter_size,
                                        activation='relu')(embedded_sequences)
        l_pool = tf.keras.layers.MaxPooling1D(pool_size=3)(l_conv)
        conv_layer.append(l_pool)

    l_merge = tf.keras.layers.concatenate(conv_layer, axis=1)
    conv = tf.keras.layers.Conv1D(filters=128, kernel_size=3, activation='relu')(l_merge)
    pool = tf.keras.layers.MaxPooling1D(pool_size=3)(conv)

    f1 = tf.keras.layers.Dropout(0.5)(pool)
    f1 = tf.keras.layers.Flatten()(f1)
    f1 = tf.keras.layers.Dense(128, activation='relu')(f1)
    logits = tf.keras.layers.Dense(commons.TARGET_SIZE, activation=None)(f1)

    predictions = tf.nn.softmax(logits)
    prediction_indices = tf.argmax(predictions, axis=1)

    if mode == tf.estimator.ModeKeys.PREDICT:
        prediction_dict = {
            'class': prediction_indices,
            'probabilities': predictions
        }
        export_outputs = {
            'predictions': tf.estimator.export.PredictOutput(prediction_dict)
        }
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions,
                                          export_outputs=export_outputs)

    weights = features[commons.WEIGHT_COLUNM_NAME]

    tf.logging.info('Logits Layer = {}'.format(logits))
    tf.logging.info('Logits Layer = {}'.format(labels))

    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits,
                                                  weights=weights)
    tf.summary.scalar('loss', loss)

    acc = tf.equal(prediction_indices, labels)
    acc = tf.reduce_mean(tf.cast(acc, tf.float32))
    tf.summary.scalar('acc', acc)

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdamOptimizer()
        train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, train_op=train_op, loss=loss)

    if mode == tf.estimator.ModeKeys.EVAL:
        eval_metrics_ops = {
            'accuracy': tf.metrics.accuracy(labels=labels, predictions=prediction_indices,
                                            weights=weights),
            'precision': tf.metrics.precision(labels=labels, predictions=prediction_indices,
                                              weights=weights),
            'recall': tf.metrics.recall(labels=labels, predictions=prediction_indices,
                                        weights=weights),
            'f1_score': streaming_f1(labels=labels, predictions=prediction_indices,
                                     n_classes=commons.TARGET_SIZE, weights=None)
        }
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss,
                                          eval_metric_ops=eval_metrics_ops)
def model_fn(features, labels, mode, params):
    vocab_table = lookup.index_table_from_file(vocabulary_file='data/vocab.csv',
                                               num_oov_buckets=1, default_value=-1)
    text = features[commons.FEATURE_COL]
    words = tf.string_split(text)
    dense_words = tf.sparse_tensor_to_dense(words, default_value=commons.PAD_WORD)
    word_ids = vocab_table.lookup(dense_words)

    padding = tf.constant([[0, 0], [0, commons.MAX_DOCUMENT_LENGTH]])
    # Pad all the word_ids entries to the maximum document length
    word_ids_padded = tf.pad(word_ids, padding)
    word_id_vector = tf.slice(word_ids_padded, [0, 0], [-1, commons.MAX_DOCUMENT_LENGTH])
    word_id_vector = {commons.FEATURE_COL: word_id_vector}

    bow_column = tf.feature_column.categorical_column_with_identity(commons.FEATURE_COL,
                                                                    num_buckets=params.N_WORDS)
    bow_embedding_column = tf.feature_column.embedding_column(bow_column, dimension=50,
                                                              combiner='sqrtn')
    bow = tf.feature_column.input_layer(word_id_vector,
                                        feature_columns=[bow_embedding_column])
    logits = tf.layers.dense(bow, 2, activation=None)

    predictions = tf.nn.softmax(logits)
    prediction_indices = tf.argmax(predictions, axis=1)

    if mode == tf.estimator.ModeKeys.PREDICT:
        prediction_dict = {
            'class': tf.gather(commons.TARGET_LABELS, prediction_indices),
            'probabilities': predictions
        }
        export_outputs = {
            'predictions': tf.estimator.export.PredictOutput(prediction_dict)
        }
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions,
                                          export_outputs=export_outputs)

    weights = features[commons.WEIGHT_COLUNM_NAME]

    print(tf.shape(labels))
    print(tf.shape(logits))

    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits,
                                                  weights=weights)
    tf.summary.scalar('loss', loss)

    acc = tf.equal(prediction_indices, labels)
    acc = tf.reduce_mean(tf.cast(acc, tf.float32))
    tf.summary.scalar('acc', acc)

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdamOptimizer()
        train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, train_op=train_op, loss=loss)

    if mode == tf.estimator.ModeKeys.EVAL:
        eval_metrics_ops = {
            'accuracy': tf.metrics.accuracy(labels=labels, predictions=prediction_indices,
                                            weights=weights),
            'precision': tf.metrics.precision(labels=labels, predictions=prediction_indices,
                                              weights=weights),
            'recall': tf.metrics.recall(labels=labels, predictions=prediction_indices,
                                        weights=weights),
            'f1_score': streaming_f1(labels=labels, predictions=prediction_indices,
                                     n_classes=commons.TARGET_SIZE, weights=None)
        }
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss,
                                          eval_metric_ops=eval_metrics_ops)
def _model_fn(self, features, labels, mode, params):
    '''
    :param features: TF Placeholder of type String of shape [BATCH_SIZE, 1]
    :param labels: TF Placeholder of type String of shape [BATCH_SIZE, 1]
    :param mode: ModeKeys
    :param params:
    :return:
    '''
    is_training = mode == ModeKeys.TRAIN

    # [BATCH_SIZE, 1]
    text_features = features[self.FEATURE_1_NAME]
    positional_features = features[self.FEATURE_3_NAME]

    if self.ner_config.USE_CHAR_EMBEDDING:
        # [BATCH_SIZE, MAX_SEQ_LENGTH, MAX_WORD_LENGTH]
        char_ids = features[self.FEATURE_2_NAME]
        tf.logging.info('char_ids: =======> {}'.format(char_ids))

        s = tf.shape(char_ids)
        # remove pad words
        char_ids_reshaped = tf.reshape(char_ids, shape=(s[0] * s[1], s[2]))  # 20 -> char dim

    with tf.variable_scope("sentence-words-2-ids"):
        word_table = lookup.index_table_from_file(
            vocabulary_file=self.ner_config.WORDS_VOCAB_FILE,
            num_oov_buckets=0,  # TODO use this for Out of Vocab
            default_value=1,  # id of <UNK> w.r.t WORD VOCAB
            name="table")
        tf.logging.info('table info: {}'.format(word_table))

        # [BATCH_SIZE, 1]
        words = tf.string_split(text_features, delimiter=SEPERATOR)
        # [BATCH_SIZE, ?] i.e. [BATCH_SIZE, VARIABLE_SEQ_LENGTH]
        densewords = tf.sparse_tensor_to_dense(words, default_value=self.ner_config.PAD_WORD)  # TODO add test case
        # [BATCH_SIZE, ?] i.e. [BATCH_SIZE, MAX_SEQ_LENGTH]
        token_ids = word_table.lookup(densewords)  # TODO check whether it is variable length or not

        tf.logging.info('token_ids_shape: ------> {}'.format(token_ids.shape[1]))
        tf.logging.info('densewords_shape: ------> {}'.format(densewords.shape))
        tf.logging.info("positional_shape: ---->{}".format(positional_features))

    with tf.variable_scope("ner-tags-2-ids"):
        if mode != ModeKeys.INFER:
            ner_table = lookup.index_table_from_file(
                vocabulary_file=self.ner_config.TAGS_VOCAB_FILE,
                num_oov_buckets=0,
                default_value=0,  # id of <UNK> w.r.t ENTITY VOCAB
                name="table")
            tf.logging.info('ner_table info: {}'.format(ner_table))

            # [BATCH_SIZE, 1]
            labels_splitted = tf.string_split(labels, delimiter=SEPERATOR)
            # [BATCH_SIZE, ?] i.e. [BATCH_SIZE, VARIABLE_SEQ_LENGTH]
            labels_splitted_dense = tf.sparse_tensor_to_dense(labels_splitted, default_value="O")
            # [BATCH_SIZE, ?] i.e. [BATCH_SIZE, MAX_SEQ_LENGTH]
            ner_ids = ner_table.lookup(labels_splitted_dense)
            ner_ids = tf.cast(ner_ids, tf.int32)
            tf.logging.info("ner_ids: {}".format(ner_ids))

    with tf.variable_scope("word-embed-layer"):
        # Layer to take the words and convert them into vectors (embeddings).
        # This creates an embeddings matrix of [VOCAB_SIZE, EMBEDDING_SIZE] and then
        # maps word indexes of the sequence:
        # [BATCH_SIZE, MAX_SEQ_LENGTH] ---> [BATCH_SIZE, MAX_SEQ_LENGTH, WORD_EMBEDDING_SIZE].
        word_embeddings = tf.contrib.layers.embed_sequence(
            token_ids,
            vocab_size=self.ner_config.VOCAB_SIZE,
            embed_dim=self.ner_config.WORD_EMBEDDING_SIZE,
            initializer=tf.contrib.layers.xavier_initializer(seed=42))

        tf.logging.info('positional_features_length =====> {}'.format(positional_features.shape))
        tf.logging.info('word_embeddings_shape: ------> {}'.format(word_embeddings.shape))

        # word_embeddings = tf.concat([word_embeddings, positional_features], axis=-1)
        tf.logging.info('word_embeddings: ------> {}'.format(word_embeddings))

        word_embeddings = tf.layers.dropout(
            word_embeddings,
            rate=self.ner_config.KEEP_PROP,
            seed=42,
            training=mode == tf.estimator.ModeKeys.TRAIN)
        # [BATCH_SIZE, MAX_SEQ_LENGTH, WORD_EMBEDDING_SIZE]
        tf.logging.info('word_embeddings =====> {}'.format(word_embeddings))

        # seq_length = get_sequence_length_old(word_embeddings) TODO working
        # [BATCH_SIZE, ]
        seq_length = get_sequence_length(token_ids)
        tf.logging.info('seq_length =====> {}'.format(seq_length))

    with tf.variable_scope("char_embed_layer"):
        if self.ner_config.USE_CHAR_EMBEDDING:
            print_error((self.ner_config.CHAR_VOCAB_SIZE, self.ner_config.CHAR_EMBEDDING_SIZE))
            char_embeddings = tf.contrib.layers.embed_sequence(
                char_ids,
                vocab_size=self.ner_config.CHAR_VOCAB_SIZE,
                embed_dim=self.ner_config.CHAR_EMBEDDING_SIZE,
                initializer=tf.contrib.layers.xavier_initializer(seed=42))

            # [BATCH_SIZE, MAX_SEQ_LENGTH, MAX_WORD_LENGTH, CHAR_EMBEDDING_SIZE]
            char_embeddings = tf.layers.dropout(
                char_embeddings,
                rate=self.ner_config.KEEP_PROP,
                seed=42,
                training=mode == tf.estimator.ModeKeys.TRAIN)  # TODO add test case
            tf.logging.info('char_embeddings =====> {}'.format(char_embeddings))

    with tf.variable_scope("chars_level_bilstm_layer"):
        if self.ner_config.USE_CHAR_EMBEDDING:
            # put the time dimension on axis=1
            shape = tf.shape(char_embeddings)
            BATCH_SIZE = shape[0]
            MAX_DOC_LENGTH = shape[1]
            CHAR_MAX_LENGTH = shape[2]

            TOTAL_DOCS_LENGTH = tf.reduce_sum(seq_length)

            # [BATCH_SIZE, MAX_SEQ_LENGTH, MAX_WORD_LENGTH, CHAR_EMBEDDING_SIZE] ===>
            # [BATCH_SIZE * MAX_SEQ_LENGTH, MAX_WORD_LENGTH, CHAR_EMBEDDING_SIZE]
            char_embeddings = tf.reshape(
                char_embeddings,
                shape=[BATCH_SIZE * MAX_DOC_LENGTH, CHAR_MAX_LENGTH,
                       self.ner_config.CHAR_EMBEDDING_SIZE],
                name="reduce_dimension_1")
            tf.logging.info('reshaped char_embeddings =====> {}'.format(char_embeddings))

            # word_lengths = get_sequence_length_old(char_embeddings) TODO working
            word_lengths = get_sequence_length(char_ids_reshaped)
            tf.logging.info('word_lengths =====> {}'.format(word_lengths))

            # bi-LSTM on chars
            cell_fw = tf.contrib.rnn.LSTMCell(self.ner_config.CHAR_LEVEL_LSTM_HIDDEN_SIZE, state_is_tuple=True)
            cell_bw = tf.contrib.rnn.LSTMCell(self.ner_config.CHAR_LEVEL_LSTM_HIDDEN_SIZE, state_is_tuple=True)

            _output = tf.nn.bidirectional_dynamic_rnn(
                cell_fw=cell_fw,
                cell_bw=cell_bw,
                dtype=tf.float32,
                sequence_length=word_lengths,
                inputs=char_embeddings,
                scope="encode_words")

            # read and concat output
            _, ((_, output_fw), (_, output_bw)) = _output
            encoded_words = tf.concat([output_fw, output_bw], axis=-1)

            # [BATCH_SIZE, MAX_SEQ_LENGTH, 2 * CHAR_LEVEL_LSTM_HIDDEN_SIZE]
            encoded_words = tf.reshape(
                encoded_words,
                shape=[BATCH_SIZE, MAX_DOC_LENGTH, 2 * self.ner_config.CHAR_LEVEL_LSTM_HIDDEN_SIZE])
            tf.logging.info('encoded_words =====> {}'.format(encoded_words))

    with tf.variable_scope("word_level_lstm_layer"):
        # Create LSTM cells with hidden size of WORD_LEVEL_LSTM_HIDDEN_SIZE.
        d_rnn_cell_fw_one = tf.nn.rnn_cell.LSTMCell(self.ner_config.WORD_LEVEL_LSTM_HIDDEN_SIZE, state_is_tuple=True)
        d_rnn_cell_bw_one = tf.nn.rnn_cell.LSTMCell(self.ner_config.WORD_LEVEL_LSTM_HIDDEN_SIZE, state_is_tuple=True)

        if is_training:
            d_rnn_cell_fw_one = tf.contrib.rnn.DropoutWrapper(d_rnn_cell_fw_one,
                                                              output_keep_prob=self.ner_config.KEEP_PROP)
            d_rnn_cell_bw_one = tf.contrib.rnn.DropoutWrapper(d_rnn_cell_bw_one,
                                                              output_keep_prob=self.ner_config.KEEP_PROP)
        else:
            d_rnn_cell_fw_one = tf.contrib.rnn.DropoutWrapper(d_rnn_cell_fw_one, output_keep_prob=1.0)
            d_rnn_cell_bw_one = tf.contrib.rnn.DropoutWrapper(d_rnn_cell_bw_one, output_keep_prob=1.0)

        d_rnn_cell_fw_one = tf.nn.rnn_cell.MultiRNNCell(
            cells=[d_rnn_cell_fw_one] * self.ner_config.NUM_LSTM_LAYERS, state_is_tuple=True)
        d_rnn_cell_bw_one = tf.nn.rnn_cell.MultiRNNCell(
            cells=[d_rnn_cell_bw_one] * self.ner_config.NUM_LSTM_LAYERS, state_is_tuple=True)

        (fw_output_one, bw_output_one), _ = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=d_rnn_cell_fw_one,
            cell_bw=d_rnn_cell_bw_one,
            dtype=tf.float32,
            sequence_length=seq_length,
            inputs=word_embeddings,
            scope="encod_sentence")

        # [BATCH_SIZE, MAX_SEQ_LENGTH, 2*WORD_LEVEL_LSTM_HIDDEN_SIZE] TODO check MAX_SEQ_LENGTH?
        encoded_sentence = tf.concat([fw_output_one, bw_output_one], axis=-1)
        tf.logging.info('encoded_sentence =====> {}'.format(encoded_sentence))

    # ================================================================================================

    with tf.variable_scope("positional_lstm_layer"):
        positional_features = tf.layers.batch_normalization(positional_features)
        tf.logging.info('positional_features =====> {}'.format(positional_features))
        tf.logging.info('encoded_words =====> {}'.format(encoded_words))
        tf.logging.info('encoded_sentence =====> {}'.format(encoded_sentence))

    with tf.variable_scope("char_word_embeddings-mergeing_layer"):
        if self.ner_config.USE_CHAR_EMBEDDING:
            encoded_doc = tf.concat([encoded_words, encoded_sentence, positional_features], axis=-1)
        else:
            encoded_doc = tf.concat([encoded_sentence, positional_features], axis=-1)

        # [BATCH_SIZE, MAX_SEQ_LENGTH, 2*WORD_LEVEL_LSTM_HIDDEN_SIZE + 2*CHAR_LEVEL_LSTM_HIDDEN_SIZE]
        encoded_doc = tf.layers.dropout(
            encoded_doc,
            rate=self.ner_config.KEEP_PROP,
            seed=42,
            training=mode == tf.estimator.ModeKeys.TRAIN)
        tf.logging.info('encoded_doc: =====> {}'.format(encoded_doc))

    with tf.variable_scope("projection"):
        logits = tf.layers.dense(encoded_doc, self.ner_config.NUM_TAGS, name="logit_predictions")
        tf.logging.info("logits: {}".format(logits))

    with tf.variable_scope("loss-layer"):
        # Defines the loss.
        if mode == ModeKeys.INFER:
            # no labels during prediction
            ner_ids = tf.placeholder(tf.int32, shape=[None, None], name="labels")
        else:
            ner_ids = ner_ids  # computed above in the ner-tags-2-ids scope

        if True:  # self.config.use_crf:
            log_likelihood, trans_params = tf.contrib.crf.crf_log_likelihood(logits, ner_ids, seq_length)
            tf.logging.info("log_likelihood: =====> {}".format(log_likelihood))

            # [NUM_TAGS, NUM_TAGS]
            trans_params = trans_params  # need to evaluate it for decoding
            tf.logging.info("trans_params: =====> {}".format(trans_params))

            ner_crf_loss = tf.reduce_mean(-log_likelihood)
            tf.summary.scalar("loss", ner_crf_loss)
        else:
            losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=ner_ids)
            mask = tf.sequence_mask(seq_length)
            losses = tf.boolean_mask(losses, mask)
            ner_crf_loss = tf.reduce_mean(losses)
            tf.summary.scalar("loss", ner_crf_loss)

        viterbi_seq, best_score = tf.contrib.crf.crf_decode(logits, trans_params, seq_length)
        tf.logging.info("viterbi_seq: {}".format(viterbi_seq))

    predictions = {
        "classes": tf.cast(tf.argmax(logits, axis=-1), tf.int32),  # [BATCH_SIZE, SEQ_LEN]
        "viterbi_seq": viterbi_seq,  # [BATCH_SIZE]
        "confidence": tf.reduce_max(tf.nn.softmax(logits, dim=-1), axis=-1),
        "top_3_indices": tf.nn.top_k(tf.nn.softmax(logits, dim=-1), k=3).indices,
        "top_3_confidence": tf.nn.top_k(tf.nn.softmax(logits, dim=-1), k=3).values
    }

    # Loss, training and eval operations are not needed during inference.
    loss = None
    train_op = None
    eval_metric_ops = {}

    if mode != ModeKeys.INFER:
        train_op = tf.contrib.layers.optimize_loss(
            loss=ner_crf_loss,
            global_step=tf.train.get_global_step(),
            optimizer=tf.train.AdamOptimizer,
            learning_rate=self.ner_config.LEARNING_RATE)

        loss = ner_crf_loss

        eval_metric_ops = {
            'Accuracy': tf.metrics.accuracy(labels=ner_ids, predictions=predictions["viterbi_seq"],
                                            name='accuracy'),
            'Precision': tf.metrics.precision(labels=ner_ids, predictions=predictions["viterbi_seq"],
                                              name='Precision'),
            'Recall': tf.metrics.recall(labels=ner_ids, predictions=predictions["viterbi_seq"],
                                        name='Recall')
        }

    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=predictions,
        loss=loss,
        train_op=train_op,
        eval_metric_ops=eval_metric_ops,
        # training_hooks=self.hooks
    )
def export():
    checkpoint_path = FLAGS.checkpoint_path
    export_path = FLAGS.export_path
    vocab_path = FLAGS.vocab_path
    num_steps = FLAGS.num_steps
    vocab_size = FLAGS.vocab_size
    embedding_size = FLAGS.embedding_size
    hidden_size = FLAGS.hidden_size
    keep_prob = FLAGS.keep_prob
    num_layers = FLAGS.num_layers
    num_classes = FLAGS.num_classes
    prop_limit = FLAGS.prop_limit

    # split 1-D string dense Tensor into a words SparseTensor
    sentences = tf.placeholder(dtype=tf.string, shape=[None], name='input_sentences')
    sparse_words = tf.string_split(sentences, delimiter=' ')

    # slice SparseTensor
    valid_indices = tf.less(sparse_words.indices, tf.constant([num_steps], dtype=tf.int64))
    valid_indices = tf.reshape(tf.split(valid_indices, [1, 1], axis=1)[1], [-1])
    valid_sparse_words = tf.sparse_retain(sparse_words, valid_indices)

    excess_indices = tf.greater_equal(sparse_words.indices, tf.constant([num_steps], dtype=tf.int64))
    excess_indices = tf.reshape(tf.split(excess_indices, [1, 1], axis=1)[1], [-1])
    excess_sparse_words = tf.sparse_retain(sparse_words, excess_indices)

    # sparse to dense
    words = tf.sparse_to_dense(
        sparse_indices=valid_sparse_words.indices,
        output_shape=[valid_sparse_words.dense_shape[0], num_steps],
        sparse_values=valid_sparse_words.values,
        default_value='_PAD')

    # map words to token ids
    words_table = lookup.index_table_from_file(
        os.path.join(vocab_path, 'words_vocab.txt'), default_value=3)
    words_ids = words_table.lookup(words)

    # BLSTM model prediction
    with tf.variable_scope('model', reuse=None):
        logits, _ = ner_model.inference(words_ids, valid_sparse_words.dense_shape[0], num_steps,
                                        vocab_size, embedding_size, hidden_size, keep_prob,
                                        num_layers, num_classes, is_training=False)
    props = tf.nn.softmax(logits)
    max_prop_values, max_prop_indices = tf.nn.top_k(props, k=1)
    predict_scores = tf.reshape(max_prop_values, shape=[-1, num_steps])
    predict_labels_ids = tf.reshape(max_prop_indices, shape=[-1, num_steps])
    predict_labels_ids = tf.to_int64(predict_labels_ids)

    # replace untrusted predictions whose probability is less than prop_limit
    trusted_prop_flag = tf.greater_equal(predict_scores, tf.constant(prop_limit, dtype=tf.float32))
    replace_prop_labels_ids = tf.to_int64(tf.fill(tf.shape(predict_labels_ids), 4))
    predict_labels_ids = tf.where(trusted_prop_flag, predict_labels_ids, replace_prop_labels_ids)

    # map token ids back to labels
    labels_table = lookup.index_to_string_table_from_file(
        os.path.join(vocab_path, 'labels_vocab.txt'), default_value='o')
    predict_labels = labels_table.lookup(predict_labels_ids)

    # extract the real BLSTM predicted labels from dense and save to sparse
    valid_sparse_predict_labels = tf.SparseTensor(
        indices=valid_sparse_words.indices,
        values=tf.gather_nd(predict_labels, valid_sparse_words.indices),
        dense_shape=valid_sparse_words.dense_shape)

    # create excess label SparseTensor filled with 'O'
    excess_sparse_predict_labels = tf.SparseTensor(
        indices=excess_sparse_words.indices,
        values=tf.fill(tf.shape(excess_sparse_words.values), 'O'),
        dense_shape=excess_sparse_words.dense_shape)

    # concat SparseTensors
    sparse_predict_labels = tf.SparseTensor(
        indices=tf.concat(axis=0, values=[valid_sparse_predict_labels.indices,
                                          excess_sparse_predict_labels.indices]),
        values=tf.concat(axis=0, values=[valid_sparse_predict_labels.values,
                                         excess_sparse_predict_labels.values]),
        dense_shape=excess_sparse_predict_labels.dense_shape)
    sparse_predict_labels = tf.sparse_reorder(sparse_predict_labels)

    # join SparseTensor into a 1-D string dense Tensor
    # remaining issue: num_split should equal the real batch size, but here it is limited to 1
    join_labels_list = []
    slice_labels_list = tf.sparse_split(sp_input=sparse_predict_labels, num_split=1, axis=0)
    for slice_labels in slice_labels_list:
        slice_labels = slice_labels.values
        join_labels = tf.reduce_join(slice_labels, reduction_indices=0, separator=' ')
        join_labels_list.append(join_labels)
    format_predict_labels = tf.stack(join_labels_list)

    saver = tf.train.Saver()
    with tf.Session() as sess:
        ckpt = tf.train.get_checkpoint_state(checkpoint_path)
        if ckpt and ckpt.model_checkpoint_path:
            print('read model from {}'.format(ckpt.model_checkpoint_path))
            saver.restore(sess, ckpt.model_checkpoint_path)
            global_step = int(ckpt.model_checkpoint_path.split('-')[-1])
        else:
            print('No checkpoint file found at %s' % FLAGS.checkpoint_path)
            return

        # Export the inference model.
        output_path = os.path.join(export_path, str(global_step))
        print('Exporting trained model to {}'.format(output_path))
        builder = tf.saved_model.builder.SavedModelBuilder(output_path)

        # Build the signature_def_map.
        predict_inputs_tensor_info = tf.saved_model.utils.build_tensor_info(sentences)
        predict_output_tensor_info = tf.saved_model.utils.build_tensor_info(format_predict_labels)
        prediction_signature = tf.saved_model.signature_def_utils.build_signature_def(
            inputs={'input_sentences': predict_inputs_tensor_info},
            outputs={'classes': predict_output_tensor_info},
            method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME)

        legacy_init_op = tf.group(tf.tables_initializer(), name='legacy_init_op')
        builder.add_meta_graph_and_variables(
            sess,
            [tf.saved_model.tag_constants.SERVING],
            signature_def_map={'predict_ner': prediction_signature},
            legacy_init_op=legacy_init_op)

        builder.save()
        print('Successfully exported model to %s' % export_path)
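# A hedged sketch (not part of the original export script) of loading the SavedModel
# produced above and running its 'predict_ner' signature; the export directory and
# example sentence are placeholders.
import tensorflow as tf

export_dir = 'export/1234'  # hypothetical <export_path>/<global_step> directory
with tf.Session(graph=tf.Graph()) as sess:
    # loader.load also runs the legacy_init_op, so the lookup tables get initialized.
    meta_graph = tf.saved_model.loader.load(sess, [tf.saved_model.tag_constants.SERVING], export_dir)
    signature = meta_graph.signature_def['predict_ner']
    input_name = signature.inputs['input_sentences'].name
    output_name = signature.outputs['classes'].name
    labels = sess.run(output_name, feed_dict={input_name: ['john lives in new york']})
    print(labels)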
def apply_vocabulary(x,
                     deferred_vocab_filename_tensor,
                     default_value=-1,
                     num_oov_buckets=0,
                     lookup_fn=None,
                     name=None):
    r"""Maps `x` to a vocabulary specified by the deferred tensor.

    This function also writes domain statistics about the vocabulary min and max
    values. Note that the min and max are inclusive, and depend on the vocab size,
    num_oov_buckets and default_value.

    In case one of the tokens contains the '\n' or '\r' characters or is empty it
    will be discarded, since we are currently writing the vocabularies as text
    files. This behavior will likely be fixed/improved in the future.

    Args:
      x: A `Tensor` or `SparseTensor` of type tf.string to which the vocabulary
        transformation should be applied. The column names are those intended for
        the transformed tensors.
      deferred_vocab_filename_tensor: The deferred vocab filename tensor as
        returned by `tft.vocabulary`.
      default_value: The value to use for out-of-vocabulary values, unless
        `num_oov_buckets` is greater than zero.
      num_oov_buckets: Any lookup of an out-of-vocabulary token will return a
        bucket ID based on its hash if `num_oov_buckets` is greater than zero.
        Otherwise it is assigned the `default_value`.
      lookup_fn: Optional lookup function. If specified, it should take a tensor
        and a deferred vocab filename as input and return a lookup `op` along with
        the table size; by default the lookup is performed with
        `lookup.index_table_from_file`.
      name: (Optional) A name for this operation.

    Returns:
      A `Tensor` or `SparseTensor` where each string value is mapped to an integer.
      Each unique string value that appears in the vocabulary is mapped to a
      different integer; integers are consecutive starting from zero, and any
      string value not in the vocabulary is assigned `default_value`.
    """
    with tf.name_scope(name, 'apply_vocab'):
        if lookup_fn:
            result, table_size = lookup_fn(x, deferred_vocab_filename_tensor)
        else:
            table = lookup.index_table_from_file(
                deferred_vocab_filename_tensor,
                num_oov_buckets=num_oov_buckets,
                default_value=default_value)
            table_size = table.size()
            result = table.lookup(x)

        # Specify schema overrides which will override the values in the schema
        # with the min and max values, which are deferred as they are only known
        # once the analyzer has run.
        #
        # `table_size` includes the num oov buckets. The default value is only
        # used if num_oov_buckets <= 0.
        min_value = tf.constant(0, tf.int64)
        max_value = table_size - 1
        if num_oov_buckets <= 0:
            min_value = tf.minimum(min_value, default_value)
            max_value = tf.maximum(max_value, default_value)
        schema_inference.set_tensor_schema_override(
            result.values if isinstance(result, tf.SparseTensor) else result,
            min_value, max_value)

        return result
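# A minimal sketch of how this mapper is typically used inside a tf.Transform
# preprocessing_fn, assuming it is exposed as tft.apply_vocabulary and paired with
# the deferred filename returned by tft.vocabulary (as referenced in the docstring).
# The 'tokens' feature name is hypothetical.
import tensorflow_transform as tft

def preprocessing_fn(inputs):
    tokens = inputs['tokens']  # tf.string feature
    vocab_file = tft.vocabulary(tokens, vocab_filename='tokens_vocab')
    # Out-of-vocabulary tokens hash into a single extra bucket instead of default_value.
    token_ids = tft.apply_vocabulary(tokens, vocab_file, num_oov_buckets=1)
    return {'token_ids': token_ids}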