def input_fn_1021(table, selected_cols="user_id,item_id,ui_fea,uu_fea,label", shuffle=True):
    col_num = len(selected_cols.split(','))
    print('input_fn: {}'.format(table))
    print('select col: {}'.format(selected_cols))
    file_queue = tf.train.string_input_producer([table], num_epochs=NUM_EPOCH, shuffle=shuffle)
    reader = tf.TableRecordReader(selected_cols=selected_cols)
    keys, values = reader.read_up_to(file_queue, num_records=BATCH_SIZE)
    default_val = [[' ']] * col_num
    default_val[-1] = [-1.0]
    [user_id, item_id, ui_fea, uu_fea, label] = tf.decode_csv(values, default_val)
    u_id_hash = tf.string_to_hash_bucket(user_id, NUM_USER_ID)
    i_id_hash = tf.string_to_hash_bucket(item_id, NUM_ITEM_ID)
    uu_info_hash = decode_node_list_attr(
        uu_fea,
        5,  # uu neigh
        user_hash_size_list,
        is_hash=True)
    ui_info_hash = decode_node_list_attr(ui_fea, 5, item_hash_size_list, is_hash=True)
    return user_id, item_id, u_id_hash, i_id_hash, label, uu_info_hash, ui_info_hash
def model_fn(features, labels, mode, params):
    features["text"] = tf.sparse_tensor_to_dense(features["text"], default_value=" ")
    if FLAGS.use_ngrams:
        features["ngrams"] = tf.sparse_tensor_to_dense(features["ngrams"], default_value=" ")
    text_lookup_table = tf.contrib.lookup.index_table_from_file(
        FLAGS.vocab_file, FLAGS.num_oov_vocab_buckets, FLAGS.vocab_size)
    text_ids = text_lookup_table.lookup(features["text"])
    text_embedding_w = tf.Variable(
        tf.random_uniform([
            FLAGS.vocab_size + FLAGS.num_oov_vocab_buckets,
            FLAGS.embedding_dimension
        ], -0.1, 0.1))
    text_embedding = tf.reduce_mean(
        tf.nn.embedding_lookup(text_embedding_w, text_ids), axis=-2)
    input_layer = text_embedding
    if FLAGS.use_ngrams:
        ngram_hash = tf.string_to_hash_bucket(features["ngrams"], FLAGS.num_ngram_buckets)
        ngram_embedding_w = tf.Variable(
            tf.random_uniform(
                [FLAGS.num_ngram_buckets, FLAGS.ngram_embedding_dimension], -0.1, 0.1))
        ngram_embedding = tf.reduce_mean(
            tf.nn.embedding_lookup(ngram_embedding_w, ngram_hash), axis=-2)
        ngram_embedding = tf.expand_dims(ngram_embedding, -2)
        input_layer = tf.concat([text_embedding, ngram_embedding], -1)
    num_classes = FLAGS.num_labels
    logits = tf.contrib.layers.fully_connected(
        inputs=input_layer, num_outputs=num_classes, activation_fn=None)
    predictions = tf.argmax(logits, axis=-1)
    probs = tf.nn.softmax(logits)
    loss, train_op = None, None
    metrics = {}
    if mode != tf.estimator.ModeKeys.PREDICT:
        label_lookup_table = tf.contrib.lookup.index_table_from_file(
            FLAGS.label_file, vocab_size=FLAGS.num_labels)
        labels = label_lookup_table.lookup(labels)
        loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits))
        opt = tf.train.AdamOptimizer(params["learning_rate"])
        if FLAGS.horovod:
            opt = hvd.DistributedOptimizer(opt)
        train_op = opt.minimize(loss, global_step=tf.train.get_global_step())
        metrics = {"accuracy": tf.metrics.accuracy(labels, predictions)}
    exports = {}
    if FLAGS.export_dir:
        exports = Exports(probs, text_embedding)
    return tf.estimator.EstimatorSpec(
        mode,
        predictions=predictions,
        loss=loss,
        train_op=train_op,
        eval_metric_ops=metrics,
        export_outputs=exports)
def testStringToOneHashBucketLegacyHash(self):
    with self.test_session():
        input_string = tf.placeholder(tf.string)
        output = tf.string_to_hash_bucket(input_string, 1)
        result = output.eval(feed_dict={input_string: ['a', 'b', 'c']})
        self.assertAllEqual([0, 0, 0], result)
def decode_node_id(info, hash_size, is_hash=False):
    id_val = tf.decode_csv(info, [[" "]])
    if is_hash:
        id_hash_val = tf.string_to_hash_bucket(id_val, hash_size)
        return id_hash_val
    return id_val
def testStringToOneHashBucket(self):
    with self.test_session():
        input_string = tf.placeholder(tf.string)
        output = tf.string_to_hash_bucket(input_string, 1)
        result = output.eval(feed_dict={input_string: ["a", "b", "c"]})
        self.assertAllEqual([0, 0, 0], result)
def decode_node_list_attr(infos, node_num, hash_size_list, is_hash=False):
    """Decode an arbitrary-length node feature list, e.g. user_friend_list or user_buy_list.

    node_num: number of nodes in the list, e.g. the number of a user's friends.
    """
    infos_list = tf.decode_csv(infos, [[" "]] * node_num, chr(3))
    infos_fea_list = [
        tf.decode_csv(i, [[' ']] * len(hash_size_list), '#') for i in infos_list
    ]
    infos_fea_val_list = [
        decode_node_attr(node, hash_size_list, is_hash=False)
        for node in infos_fea_list
    ]
    # Regroup per-node attribute values into per-attribute lists.
    return_list = [[] for i in range(len(hash_size_list))]
    for x in infos_fea_val_list:
        for idx, val in enumerate(hash_size_list):
            return_list[idx].append(x[idx])
    if is_hash:
        return_hash_list = [
            tf.string_to_hash_bucket(node, hash_size)
            for node, hash_size in zip(return_list, hash_size_list)
        ]
        return return_hash_list
    return return_list
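# A minimal, hypothetical usage sketch of decode_node_list_attr (the example data below is my
# own assumption, not from the original code): each node is a '#'-separated list of
# "key:value" attributes, and nodes are joined with chr(3). It also assumes the
# decode_node_attr helper shown later in this collection is in scope.
import tensorflow as tf

uu_fea = tf.constant("age:25#city:sh" + chr(3) + "age:31#city:bj")
uu_hash = decode_node_list_attr(uu_fea, node_num=2, hash_size_list=[100, 50], is_hash=True)
with tf.Session() as sess:
    # One int64 tensor per attribute, each of length node_num.
    print(sess.run(uu_hash))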
def model_fn(features, labels, mode, params):
    text_lookup_table = tf.contrib.lookup.index_table_from_file(
        FLAGS.vocab_file, FLAGS.num_oov_vocab_buckets, FLAGS.vocab_size)
    text_ids = text_lookup_table.lookup(features["text"])
    text_embedding_w = tf.Variable(
        tf.random_uniform([
            FLAGS.vocab_size + FLAGS.num_oov_vocab_buckets,
            FLAGS.embedding_dimension
        ], -0.1, 0.1))
    text_embedding = tf.reduce_mean(
        tf.nn.embedding_lookup(text_embedding_w, text_ids), axis=-2)
    text_embedding = tf.expand_dims(text_embedding, -2)
    input_layer = text_embedding
    if FLAGS.use_ngrams:
        ngram_hash = tf.string_to_hash_bucket(features["ngrams"], FLAGS.num_ngram_buckets)
        ngram_embedding_w = tf.Variable(
            tf.random_uniform(
                [FLAGS.num_ngram_buckets, FLAGS.ngram_embedding_dimension], -0.1, 0.1))
        ngram_embedding = tf.reduce_mean(
            tf.nn.embedding_lookup(ngram_embedding_w, ngram_hash), axis=-2)
        ngram_embedding = tf.expand_dims(ngram_embedding, -2)
        input_layer = tf.concat([text_embedding, ngram_embedding], -1)
    num_classes = len(open(FLAGS.label_file).readlines())
    logits = tf.contrib.layers.fully_connected(
        inputs=input_layer, num_outputs=num_classes, activation_fn=None)
    predictions = tf.argmax(logits, axis=-1)
    loss, train_op = None, None
    metrics = {}
    if mode != tf.estimator.ModeKeys.PREDICT:
        loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits))
        # Squeeze dimensions from labels and switch to 0-offset
        labels = tf.squeeze(labels, -1)
        opt = tf.train.AdamOptimizer(params["learning_rate"])
        if FLAGS.horovod:
            opt = hvd.DistributedOptimizer(opt)
        train_op = opt.minimize(loss, global_step=tf.train.get_global_step())
        metrics = {"accuracy": tf.metrics.accuracy(labels, predictions)}
    exports = {}
    if FLAGS.export_dir:
        probs = tf.nn.softmax(logits)
        exports["proba"] = tf.estimator.export.ClassificationOutput(scores=probs)
        exports["embedding"] = tf.estimator.export.RegressionOutput(value=text_embedding)
        exports[tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY] = \
            tf.estimator.export.ClassificationOutput(scores=probs)
    return tf.estimator.EstimatorSpec(
        mode,
        predictions=predictions,
        loss=loss,
        train_op=train_op,
        eval_metric_ops=metrics,
        export_outputs=exports)
def cross_feature(row, feature_name_1, feature_name_2, hash_size):
    row["cross_" + feature_name_1 + "_" + feature_name_2] = tf.one_hot(
        tf.string_to_hash_bucket(
            tf.string_join([
                tf.cast(row[feature_name_1], tf.string),
                tf.cast(row[feature_name_2], tf.string)
            ]), hash_size),
        2 * hash_size)
    return row
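# A hedged sketch of how cross_feature might be applied inside a tf.data pipeline; the
# dataset contents, feature names, and bucket size below are hypothetical, not from the
# original snippet.
import tensorflow as tf

dataset = tf.data.Dataset.from_tensor_slices({
    "gender": ["m", "f", "m"],
    "city": ["sh", "bj", "sh"],
})
dataset = dataset.map(lambda row: cross_feature(row, "gender", "city", hash_size=100))

row = dataset.make_one_shot_iterator().get_next()
with tf.Session() as sess:
    # One-hot crossed feature of depth 2 * hash_size.
    print(sess.run(row["cross_gender_city"]))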
def testStringToHashBuckets(self):
    with self.test_session():
        input_string = tf.placeholder(tf.string)
        output = tf.string_to_hash_bucket(input_string, 10)
        result = output.eval(feed_dict={input_string: ["a", "b", "c"]})
        # Hash64('a') -> 2996632905371535868 -> mod 10 -> 8
        # Hash64('b') -> 5795986006276551370 -> mod 10 -> 0
        # Hash64('c') -> 14899841994519054197 -> mod 10 -> 7
        self.assertAllEqual([8, 0, 7], result)
def testStringToHashBucketsLegacyHash(self):
    with self.test_session():
        input_string = tf.placeholder(tf.string)
        output = tf.string_to_hash_bucket(input_string, 10)
        result = output.eval(feed_dict={input_string: ['a', 'b', 'c']})
        # Hash64('a') -> 2996632905371535868 -> mod 10 -> 8
        # Hash64('b') -> 5795986006276551370 -> mod 10 -> 0
        # Hash64('c') -> 14899841994519054197 -> mod 10 -> 7
        self.assertAllEqual([8, 0, 7], result)
def decode_node_attr(infos, hash_size_list, is_hash=False):
    fea_val_list = [
        tf.decode_csv(info, [[" "], [" "]], ":")[1] for info in infos
    ]
    if is_hash:
        fea_hash_list = [
            tf.string_to_hash_bucket(i, j)
            for (i, j) in zip(fea_val_list, hash_size_list)
        ]
        return fea_hash_list
    return fea_val_list
def testWithHash(self):
    parser_op = fm_ops.fm_parser(
        tf.constant(self.EXAMPLES, tf.string), self.VOCAB_SIZE, True)
    string_ids = [str(x) for x in self.TARGET_FEATURE_IDS]
    hashed_feature_ids = tf.string_to_hash_bucket(string_ids, self.VOCAB_SIZE)
    with self.test_session() as sess:
        hashed_ids = sess.run(hashed_feature_ids)
        labels, sizes, feature_ids, feature_vals = sess.run(parser_op)
        self.assertAllClose(labels, self.TARGET_LABELS)
        self.assertAllEqual(sizes, self.TARGET_SIZES)
        self.assertAllEqual(feature_ids, hashed_ids)
        self.assertAllClose(feature_vals, self.TARGET_FEATURE_VALS)
def decode_node_attr(infos, hash_size_list, is_hash=False):
    # Decode an arbitrary number of node attributes; len(infos) can be any number.
    # Works for both user and item nodes.
    fea_val_list = [
        tf.decode_csv(info, [[" "], [" "]], ":")[1] for info in infos
    ]
    if is_hash:
        fea_hash_list = [
            tf.string_to_hash_bucket(i, j)
            for (i, j) in zip(fea_val_list, hash_size_list)
        ]
        return fea_hash_list
    return fea_val_list
def read_and_decode(filename_queue):
    reader = tf.TFRecordReader()
    _, serialized_example = reader.read(filename_queue)
    features = tf.parse_single_example(
        serialized_example,
        features={
            "label": tf.FixedLenFeature([], tf.float32),
            "categorical_features": tf.FixedLenFeature([CATEGORICAL_FEATURES_SIZE], tf.string),
            "continuous_features": tf.FixedLenFeature([CONTINUOUS_FEATURES_SIZE], tf.float32),
        })
    label = features["label"]
    continuous_features = features["continuous_features"]
    categorical_features = tf.cast(
        tf.string_to_hash_bucket(features["categorical_features"], BUCKET_SIZE),
        tf.float32)
    return label, tf.concat(0, [continuous_features, categorical_features])
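# A minimal sketch of how a queue-based reader like read_and_decode is typically consumed in
# TF1; the file name, batch size, and capacities below are assumptions, not from the original
# snippet.
import tensorflow as tf

filename_queue = tf.train.string_input_producer(["train.tfrecords"], num_epochs=1)
label, example_features = read_and_decode(filename_queue)
label_batch, feature_batch = tf.train.shuffle_batch(
    [label, example_features], batch_size=128, capacity=10000, min_after_dequeue=1000)

with tf.Session() as sess:
    sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    # ... run training steps that consume label_batch / feature_batch ...
    coord.request_stop()
    coord.join(threads)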
def decode_node_list_attr(infos, node_num, hash_size_list, is_hash=False):
    infos_list = tf.decode_csv(infos, [[" "]] * node_num, chr(3))
    infos_fea_list = [
        tf.decode_csv(i, [[' ']] * len(hash_size_list), '#') for i in infos_list
    ]
    infos_fea_val_list = [
        decode_node_attr(node, hash_size_list, is_hash=False)
        for node in infos_fea_list
    ]
    return_list = [[] for i in range(len(hash_size_list))]
    for x in infos_fea_val_list:
        for idx, val in enumerate(hash_size_list):
            return_list[idx].append(x[idx])
    if is_hash:
        return_hash_list = [
            tf.string_to_hash_bucket(node, hash_size)
            for node, hash_size in zip(return_list, hash_size_list)
        ]
        return return_hash_list
    return return_list
# -*- coding:utf-8 -*-
# @version: 1.0
# @author: wuxikun
# @date: '2020/11/28 9:08 PM'
import tensorflow as tf

features = {
    'sex': ['male', 'male', 'female', 'female', 'mid', 'man'],
}

sex_tensor = tf.constant(['male', 'male', 'female', 'female', 'mid', 'man'], dtype=tf.string)

# sex_column = tf.feature_column.categorical_column_with_vocabulary_list('sex', ['male', 'female'])
sex_column = tf.string_to_hash_bucket(sex_tensor, 10)

# sex_column = tf.feature_column.indicator_column(sex_column)
# columns = [sex_column]
#
# inputs = tf.feature_column.input_layer(features, columns)

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(tf.tables_initializer())
    sess.run(init)
    v = sess.run(sex_column)
    print(v)
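# A minimal sketch of the commented-out feature_column route from the snippet above, assuming
# TF 1.x: hashing via tf.feature_column.categorical_column_with_hash_bucket instead of calling
# tf.string_to_hash_bucket directly. The bucket size of 10 mirrors the snippet; everything
# else is an illustrative assumption.
import tensorflow as tf

features = {'sex': ['male', 'male', 'female', 'female', 'mid', 'man']}
sex_hashed = tf.feature_column.categorical_column_with_hash_bucket('sex', hash_bucket_size=10)
sex_indicator = tf.feature_column.indicator_column(sex_hashed)
inputs = tf.feature_column.input_layer(features, [sex_indicator])

with tf.Session() as sess:
    sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
    # One 10-dimensional one-hot row per input string.
    print(sess.run(inputs))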
print("string expand_dim 0: ", sess.run(tf.expand_dims(string_tensor, dim=0))) # 相当于插入一维 print("string expand_dim -1: ", sess.run(tf.expand_dims(string_tensor, dim=-1))) # 相当于插入一维 print("sparse tensor: ", sess.run(chars)) print("sp values: ", sess.run(chars.values)) """ string : [b'hello world' b'a b c' b'hello hkx c'] string newaxis: [[b'hello world' b'a b c' b'hello hkx c']] string expand_dim 0: [[b'hello world' b'a b c' b'hello hkx c']] string expand_dim -1: [[b'hello world'] [b'a b c'] [b'hello hkx c']] sparse tensor: SparseTensorValue(indices=array([[0, 0], [0, 1], [1, 0], [1, 1], [1, 2], [2, 0], [2, 1], [2, 2]], dtype=int64), values=array([b'hello', b'world', b'a', b'b', b'c', b'hello', b'hkx', b'c'], dtype=object), dense_shape=array([3, 3], dtype=int64)) sp values: [b'hello' b'world' b'a' b'b' b'c' b'hello' b'hkx' b'c'] """ sess = tf.Session() string = ["hello world", "a b c", "hello hkx c"] string_bucket = tf.string_to_hash_bucket(string, num_buckets=1000) print("string_bucket: ", sess.run(string_bucket)) sess.close()
tf.reduce_join(a, -2)  # ==> ["ac", "bd"]
tf.reduce_join(a, -1)  # ==> ["ab", "cd"]
tf.reduce_join(a, 0, keep_dims=True)  # ==> [["ac", "bd"]]
tf.reduce_join(a, 1, keep_dims=True)  # ==> [["ab"], ["cd"]]
tf.reduce_join(a, 0, separator=".")  # ==> ["a.c", "b.d"]
tf.reduce_join(a, [0, 1])  # ==> ["acbd"]
tf.reduce_join(a, [1, 0])  # ==> ["abcd"]
tf.reduce_join(a, [])  # ==> ["abcd"]

b = tf.convert_to_tensor(["ac"])
c = tf.convert_to_tensor(["bd"])
d = tf.string_join([b, c], separator=" ", name=None)
print(sess.run(d))

e = tf.reduce_join(a, 0)
print(tf.string_to_hash_bucket(e, 2))
print(sess.run(tf.string_to_hash_bucket(e, 5)))
f = tf.string_to_hash_bucket(e, 2)

hw = tf.convert_to_tensor(["hello worls"])
print(sess.run(tf.string_split(hw, delimiter=' ')))

### Exercise module_1_4
# Create new string tensors with:
# a)  transform str_1 in a way to get [["name: ", "surname: "], ["Jan", "Idziak"]]
# a') str_1 with argument ["name: Jan", "surname: Idziak"]
# b)  str_2 with argument [["helo ", "world"], ["tensor", "flow"]]
# b') str_2 with argument ["helo world", "tensorflow"]
# c)  Create simple string tensors with arguments:
# c') str_3 - ["My name is:"]
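# A hedged sketch for one reading of exercise parts a)/a') above: an element-wise
# tf.string_join over the two rows of the 2x2 form reproduces the flat
# ["name: Jan", "surname: Idziak"] form. The exact definition of str_1 is an assumption here.
import tensorflow as tf

str_1 = tf.constant([["name: ", "surname: "], ["Jan", "Idziak"]])
joined = tf.string_join([str_1[0], str_1[1]])
with tf.Session() as sess:
    print(sess.run(joined))  # [b'name: Jan' b'surname: Idziak']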
def hash_one_hot(row, feature_name, hash_size):
    row[feature_name] = tf.one_hot(
        tf.string_to_hash_bucket(tf.cast(row[feature_name], tf.string), hash_size),
        hash_size)
    return row
# Section 9: String operations
import tensorflow as tf

a = tf.constant('Hello,world!')
b = tf.constant('I love tensorflow.')
sess = tf.Session()

# Compute hash values
c = tf.string_to_hash_bucket_fast(a, 10000000)
d = tf.string_to_hash_bucket_strong(a, 10000000, key=[1, 3])
e = tf.string_to_hash_bucket(a, 10000000)
result = sess.run([c, d, e])
print('Hashing:\nfast:%s\nstrong:%s\nnormal:%s\n\n' % (result[0], result[1], result[2]))

# Join tensors into a single string
c = tf.reduce_join([a, b], axis=0)
d = tf.string_join([a, b], '__')
result = sess.run([c, d])
print('Joining:\nc=%s\nd=%s\n\n' % (result[0], result[1]))

# Split strings
c = tf.string_split([a], ',')
d = tf.substr(a, 0, 5)
result = sess.run([c, d])