def __init__(self, args):
    """Initialize the SkipThoughts model: embedding initializer, vocab, and
    the feature spec used to parse serialized Examples.

    Args:
        args: parsed command-line namespace; must provide `args.vocab`, a
            path to a newline-delimited vocabulary file.
    """
    super(SkipThoughtsModel, self).__init__(args)
    # self.uniform_init_scale is assumed to be set by the base class
    # __init__ — TODO confirm.
    self.uniform_initializer = tf.random_uniform_initializer(
        minval=-self.uniform_init_scale,
        maxval=self.uniform_init_scale)
    with open(args.vocab, 'r') as f:
        self.vocab = [l.strip() for l in f]
    # Set up input parsing stuff via tf.contrib.learn...
    self.vocab_size = len(self.vocab)
    encode = layers.sparse_column_with_integerized_feature(
        'encode', bucket_size=self.vocab_size)
    decode_pre = layers.sparse_column_with_integerized_feature(
        'decode_pre', bucket_size=self.vocab_size)
    decode_post = layers.sparse_column_with_integerized_feature(
        'decode_post', bucket_size=self.vocab_size)
    self.features = {
        'encode': layers.embedding_column(encode, dimension=100),
        'decode_pre': layers.embedding_column(decode_pre, dimension=100),
        'decode_post': layers.embedding_column(decode_post, dimension=100),
    }
    # BUG FIX: the original assignment ended with a stray trailing comma,
    # which made self.feature_spec a 1-tuple wrapping the spec dict rather
    # than the dict itself. Harmless today only because it is overwritten
    # below, but a trap for anyone removing the "easy way" branch.
    self.feature_spec = tf.contrib.layers.create_feature_spec_for_parsing(
        self.features)
    # ... or do it the easy way: a plain VarLenFeature spec, which is what
    # actually ends up being used (it overwrites the column-based spec).
    self.features = {
        'encode': tf.VarLenFeature(dtype=tf.int64),
        'decode_pre': tf.VarLenFeature(dtype=tf.int64),
        'decode_post': tf.VarLenFeature(dtype=tf.int64),
    }
    self.feature_spec = self.features
# Dtypes of the raw input fields as they arrive from the data source.
INPUT_COLUMN_NAMES = {
    'dayofweek': tf.string,
    'hourofday': tf.int32,
    'pickuplon': tf.float32,
    'pickuplat': tf.float32,
    'dropofflon': tf.float32,
    'dropofflat': tf.float32,
    'passengers': tf.int32,
}

# These are the raw input columns, and will be provided for prediction also
INPUT_COLUMNS = [
    # define features
    layers.sparse_column_with_keys(
        'dayofweek',
        keys=['Sun', 'Mon', 'Tues', 'Wed', 'Thu', 'Fri', 'Sat']),
    layers.sparse_column_with_integerized_feature('hourofday', bucket_size=24),
] + [
    # First three are engineered features created in the input_fn; the rest
    # are the raw continuous inputs, all plain real-valued columns.
    layers.real_valued_column(name)
    for name in ('latdiff', 'londiff', 'euclidean',
                 'pickuplon', 'pickuplat', 'dropofflat', 'dropofflon',
                 'passengers')
]
# Replace every categorical column with an integer-id column ("<name>_ids"),
# remembering each fitted LabelEncoder so predictions can reuse the mapping.
for cat_name in categorical_vars:
    encoder = LabelEncoder().fit(X_train[cat_name])
    X_train[cat_name + '_ids'] = encoder.transform(X_train[cat_name])
    X_test[cat_name + '_ids'] = encoder.transform(X_test[cat_name])
    X_train.pop(cat_name)
    X_test.pop(cat_name)
    categorical_var_encoders[cat_name] = encoder

### Note: Feature Columns currently (2016/10/22) not working, update is coming.
# Setup feature columns.
CATEGORICAL_EMBED_SIZE = 10  # Note, you can customize this per variable.
_continuous_cols = [layers.real_valued_column(name) for name in continues_vars]
_embedded_cols = [
    layers.embedding_column(
        layers.sparse_column_with_integerized_feature(
            name + '_ids', len(categorical_var_encoders[name].classes_)),
        CATEGORICAL_EMBED_SIZE)
    for name in categorical_vars
]
feature_columns = _continuous_cols + _embedded_cols

# Linear classifier.
'''
random.seed(42)
tflr = learn.LinearClassifier(n_classes=2,
    feature_columns=feature_columns,
    optimizer=tf.train.GradientDescentOptimizer(learning_rate=0.05))
tflr.fit(input_fn=train_input_fn, steps=500)
print(list(tflr.predict(input_fn=test_input_fn, as_iterable=True)), y_test)
print(accuracy_score(y_test, list(tflr.predict(input_fn=test_input_fn, as_iterable=True))))
'''
def gen_feature(feature_conf):
    """Build a tf.contrib feature column from a single feature-config dict.

    Dispatches on which keys are present in `feature_conf`:
    vocab_size -> keyed sparse column + embedding;
    hash_bucket_size alone -> plain sparse id column;
    hash_bucket_size + embedding_dimension -> embedded sparse column;
    vocabulary_file + embedding_dimension -> vocab-file sparse column + embedding;
    boundaries (+/- embedding_dimension) -> bucketized real column;
    otherwise -> plain real-valued column.

    NOTE(review): branch ORDER is load-bearing. The first `"vocab_size"`
    test fires for ANY config containing that key, so the later
    `vocabulary_file` branch (which itself reads feature_conf["vocab_size"])
    appears unreachable — confirm which configs actually occur.
    """
    name = feature_conf[feature_name_key]
    value_type = feature_conf[value_type_key]
    if "vocab_size" in feature_conf:
        # NOTE(review): keys=range(...) yields ints while dtype=tf.string —
        # presumably the keys should be strings; verify against real configs.
        id_feature = fc.sparse_column_with_keys(
            column_name=name,
            keys=range(feature_conf['vocab_size']),
            dtype=tf.string)
        # NOTE(review): relies on the private fc._EmbeddingColumn class, and
        # passes the feature's own name-key lookup as shared_embedding_name
        # (other branches use feature_conf.get('shared_name')) — confirm.
        return fc._EmbeddingColumn(
            id_feature,
            dimension=feature_conf['embedding_dimension'],
            shared_embedding_name=feature_conf.get(feature_name_key),
        )
    elif "hash_bucket_size" in feature_conf \
            and "embedding_dimension" not in feature_conf:
        # Plain sparse id column (no embedding): integerized for int inputs,
        # hashed for everything else.
        if value_type == "Int":
            id_feature = layers.sparse_column_with_integerized_feature(
                column_name=name,
                bucket_size=feature_conf['hash_bucket_size'],
                combiner=_get_combiner(feature_conf),
                # use_hashmap=use_hashmap
            )
        else:
            id_feature = layers.sparse_column_with_hash_bucket(
                column_name=name,
                hash_bucket_size=feature_conf['hash_bucket_size'],
                combiner=_get_combiner(feature_conf),
                # use_hashmap=use_hashmap
            )
        return id_feature
    elif "embedding_dimension" in feature_conf \
            and "hash_bucket_size" in feature_conf \
            and "boundaries" not in feature_conf \
            and "vocabulary_file" not in feature_conf:
        # Embedded sparse column over a hash/integer id space.
        if value_type == "Int":
            return _EmbeddingColumn(
                sparse_id_column=layers.sparse_column_with_integerized_feature(
                    column_name=name,
                    bucket_size=feature_conf['hash_bucket_size'],
                    combiner=_get_combiner(feature_conf),
                    # use_hashmap=use_hashmap
                ),
                dimension=feature_conf['embedding_dimension'],
                combiner=_get_combiner(feature_conf),
                shared_embedding_name=feature_conf.get('shared_name', None))
        else:
            id_feature = layers.sparse_column_with_hash_bucket(
                column_name=name,
                hash_bucket_size=feature_conf['hash_bucket_size'],
                # use_hashmap=use_hashmap
            )
            return _EmbeddingColumn(
                id_feature,
                dimension=feature_conf['embedding_dimension'],
                combiner=_get_combiner(feature_conf),
                shared_embedding_name=feature_conf.get('shared_name', None),
                max_norm=None)
    elif "embedding_dimension" in feature_conf \
            and "boundaries" not in feature_conf and "vocabulary_file" in feature_conf:
        # Embedded column backed by an on-disk vocabulary file.
        use_hashmap = feature_conf.get("use_hashmap", False)  # read but unused below
        if value_type == "Int":
            raise Exception(
                "embedding with vocabulary_file does not support Int type")
        else:
            id_feature = fc.sparse_column_with_vocabulary_file(
                column_name=name,
                vocabulary_file=feature_conf["vocabulary_file"],
                num_oov_buckets=feature_conf["num_oov_buckets"],
                vocab_size=feature_conf["vocab_size"],
            )
            return _EmbeddingColumn(
                id_feature,
                dimension=feature_conf['embedding_dimension'],
                combiner=_get_combiner(feature_conf),
                shared_embedding_name=feature_conf.get('shared_name', None),
                max_norm=None)
    elif "embedding_dimension" in feature_conf \
            and "boundaries" in feature_conf:
        # Real-valued input bucketized at the given boundaries, then embedded
        # (project-local embedding_bucketized_column helper).
        return embedding_bucketized_column(
            layers.real_valued_column(
                column_name=name,
                dimension=feature_conf.get('dimension', 1),
                default_value=[
                    0.0 for _ in range(int(feature_conf.get('dimension', 1)))
                ]),
            boundaries=[
                float(b) for b in feature_conf['boundaries'].split(',')
            ],
            embedding_dimension=feature_conf["embedding_dimension"],
            max_norm=None,
            shared_name=feature_conf.get('shared_name', None),
            add_random=feature_conf.get('add_random', False))
    elif "embedding_dimension" not in feature_conf \
            and "boundaries" in feature_conf:
        # Bucketized real-valued column without embedding (one-hot per bucket).
        return layers.bucketized_column(
            layers.real_valued_column(
                column_name=name,
                dimension=feature_conf.get('dimension', 1),
                default_value=[
                    0.0 for _ in range(int(feature_conf.get('dimension', 1)))
                ]),
            boundaries=[
                float(b) for b in feature_conf['boundaries'].split(',')
            ])
    else:
        # Fallback: dense real-valued column, optionally L2-normalized when
        # the config carries an 'l2_norm' key.
        return layers.real_valued_column(
            column_name=name,
            dimension=feature_conf.get('dimension', 1),
            default_value=[
                0.0 for _ in range(int(feature_conf.get('dimension', 1)))
            ],
            normalizer=None if 'l2_norm' not in feature_conf else
            lambda x: tf.nn.l2_normalize(x, dim=-1))
from tensorflow.contrib import metrics
import numpy as np

tf.logging.set_verbosity(tf.logging.INFO)

# Column order of the input CSV rows: label first, then raw features, then a
# row key used to join predictions back to inputs.
CSV_COLUMNS = 'fare_amount,dayofweek,hourofday,pickuplon,pickuplat,dropofflon,dropofflat,passengers,key'.split(',')
# Continuous columns that get scaled/normalized downstream — TODO confirm where.
SCALE_COLUMNS = ['pickuplon','pickuplat','dropofflon','dropofflat','passengers']
LABEL_COLUMN = 'fare_amount'
KEY_FEATURE_COLUMN = 'key'
# Per-column defaults for tf.decode_csv, matching CSV_COLUMNS positionally
# (lon/lat defaults presumably center on NYC — verify).
DEFAULTS = [[0.0], ['Sun'], [0], [-74.0], [40.0], [-74.0], [40.7], [1.0], ['nokey']]

# These are the raw input columns, and will be provided for prediction also
INPUT_COLUMNS = [
    # define features
    layers.sparse_column_with_keys('dayofweek', keys=['Sun', 'Mon', 'Tues', 'Wed', 'Thu', 'Fri', 'Sat']),
    layers.sparse_column_with_integerized_feature('hourofday', bucket_size=24),
    # engineered features that are created in the input_fn
    layers.real_valued_column('latdiff'),
    layers.real_valued_column('londiff'),
    layers.real_valued_column('euclidean'),
    # real_valued_column
    layers.real_valued_column('pickuplon'),
    layers.real_valued_column('pickuplat'),
    layers.real_valued_column('dropofflat'),
    layers.real_valued_column('dropofflon'),
    layers.real_valued_column('passengers'),
]

# NOTE: the body of this function lies beyond this chunk of the file.
def build_estimator(model_dir, nbuckets, hidden_units):
def get_input(pattern, features, shuffle=False):
    """Return keyed batches of parsed Examples from files matching `pattern`.

    Args:
        pattern: file glob passed to the keyed-batch reader.
        features: parsing spec (feature name -> tf parsing config).
        shuffle: whether to randomize input order.
    """
    examples = tf.contrib.learn.io.read_keyed_batch_features(
        pattern,
        batch_size=1,
        features=features,
        randomize_input=shuffle,
        reader=tf.TFRecordReader)
    return examples

# Vocab is a project-local class; presumably maps token <-> id — confirm.
vocab = Vocab(base_dir + '/vocab.txt')
vocab_size = len(vocab)
if tf_layers:
    # Verbose path: declare feature columns, then derive a parsing spec.
    encode = layers.sparse_column_with_integerized_feature(
        'encode', bucket_size=vocab_size)
    decode_pre = layers.sparse_column_with_integerized_feature(
        'decode_pre', bucket_size=vocab_size)
    decode_post = layers.sparse_column_with_integerized_feature(
        'decode_post', bucket_size=vocab_size)
    features = {
        'encode': encode,
        'encode_emb': layers.embedding_column(encode, dimension=100),
        'decode_pre': layers.embedding_column(decode_pre, dimension=100),
        'decode_post': layers.embedding_column(decode_post, dimension=100),
    }
    features = tf.contrib.layers.create_feature_spec_for_parsing(features)
else:
    # This little dict seems equivalent to the waay more verbose
    # tf.contrib.layers approach. But apparently the latter helps,
    # especially when it comes to tf serving. Still to see the benefit...
    # (else-branch body continues beyond this chunk of the file)
def build_estimator(model_dir=MODEL_DIR):
    """Build an estimator using CONTINUOUS_COLUMNS, BINARY_COLUMNS and
    MULTI_CATEGORY_COLUMNS.

    Args:
        model_dir: directory where the estimator writes checkpoints.

    Returns:
        A tf.contrib.learn classifier — wide (LinearClassifier), deep
        (DNNClassifier) or wide+deep (DNNLinearCombinedClassifier) —
        selected by the module-level MODEL_TYPE.
    """
    # Fixed docstring typo ("CONTINTUOUS_COLUMNS" -> "CONTINUOUS_COLUMNS").
    # Renamed from `bucketized_columns`: these are hashed / integerized
    # *sparse* id columns, not outputs of layers.bucketized_column — the old
    # name was misleading.
    sparse_columns = \
        [sparse_column_with_hash_bucket(col, 1000)
         for col in MULTI_CATEGORY_COLUMNS] + \
        [sparse_column_with_integerized_feature(col, bucket_size=2)
         for col in BINARY_COLUMNS]

    real_valued_columns = \
        [real_valued_column(col) for col in CONTINUOUS_COLUMNS]

    # No feature crosses defined yet; kept as a list so crosses can be added.
    crossed_columns = []

    # Wide side: raw sparse + continuous features feed the linear model.
    wide_columns = \
        sparse_columns + \
        real_valued_columns + \
        crossed_columns

    # Deep side: sparse id columns are embedded before feeding the DNN.
    deep_columns = \
        [embedding_column(col, dimension=EMBEDDING_DIMENSION)
         for col in sparse_columns] + \
        real_valued_columns + \
        crossed_columns

    # Each branch uses the estimator's default optimizer; the original kept
    # commented-out GradientDescent/Ftrl alternatives, removed here as dead code.
    if MODEL_TYPE == "wide":
        print('Creating wide LinearClassifier model...\n')
        model = tf.contrib.learn.LinearClassifier(
            model_dir=model_dir,
            n_classes=2,
            feature_columns=wide_columns,
        )
    elif MODEL_TYPE == "deep":
        print('Creating deep DNNClassifier model...\n')
        model = tf.contrib.learn.DNNClassifier(
            model_dir=model_dir,
            n_classes=2,
            feature_columns=deep_columns,
            hidden_units=HIDDEN_UNITS,
        )
    else:
        print('Creating deepNwide DNNLinearCombinedClassifier model...\n')
        model = tf.contrib.learn.DNNLinearCombinedClassifier(
            model_dir=model_dir,
            n_classes=2,
            linear_feature_columns=wide_columns,
            dnn_feature_columns=deep_columns,
            dnn_hidden_units=HIDDEN_UNITS,
        )
    return model
def build_estimator(model_dir, model_type):
    """Build an estimator.

    Args:
        model_dir: checkpoint/export directory for the estimator.
        model_type: "wide" -> LinearClassifier, "deep" -> DNNClassifier,
            anything else -> DNNLinearCombinedClassifier (wide+deep).

    Returns:
        The constructed tf.contrib.learn classifier.
    """
    # Sparse base columns. The second argument is the bucket_size, i.e. the
    # assumed cardinality of each integer id field — presumably taken from
    # the dataset; verify against the actual data.
    userID = layers.sparse_column_with_integerized_feature('userID', 2805118)
    creativeID = layers.sparse_column_with_integerized_feature(
        'creativeID', 6582)
    positionID = layers.sparse_column_with_integerized_feature(
        'positionID', 7645)
    adID = layers.sparse_column_with_integerized_feature('adID', 3616)
    # 'camgaignID' (sic) — misspelled, but it must match the input feature key.
    camgaignID = layers.sparse_column_with_integerized_feature(
        'camgaignID', 720)
    advertiserID = layers.sparse_column_with_integerized_feature(
        'advertiserID', 91)
    appID = layers.sparse_column_with_integerized_feature('appID', 50)
    sitesetID = layers.sparse_column_with_integerized_feature('sitesetID', 3)
    appCategory = layers.sparse_column_with_integerized_feature(
        'appCategory', 14)
    appPlatform = layers.sparse_column_with_integerized_feature(
        'appPlatform', 2)
    education = layers.sparse_column_with_integerized_feature('education', 8)
    gender = layers.sparse_column_with_integerized_feature('gender', 3)
    haveBaby = layers.sparse_column_with_integerized_feature('haveBaby', 7)
    marriageStatus = layers.sparse_column_with_integerized_feature(
        'marriageStatus', 4)
    positionType = layers.sparse_column_with_integerized_feature(
        'positionType', 6)
    hometown_c = layers.sparse_column_with_integerized_feature(
        'hometown_c', 22)
    hometown_p = layers.sparse_column_with_integerized_feature(
        'hometown_p', 35)
    residence_c = layers.sparse_column_with_integerized_feature(
        'residence_c', 22)
    residence_p = layers.sparse_column_with_integerized_feature(
        'residence_p', 35)
    telecomsOperator = layers.sparse_column_with_integerized_feature(
        'telecomsOperator', 4)
    connectionType = layers.sparse_column_with_integerized_feature(
        'connectionType', 5)
    clickTime_week = layers.sparse_column_with_integerized_feature(
        'clickTime_week', 7)

    # Continuous base columns.
    age = layers.real_valued_column("age")
    inst_app_installed = layers.real_valued_column('inst_app_installed')
    inst_cate_percent = layers.real_valued_column('inst_cate_percent')
    inst_cnt_appcate = layers.real_valued_column('inst_cnt_appcate')
    inst_cnt_installed = layers.real_valued_column('inst_cnt_installed')
    inst_is_installed = layers.real_valued_column('inst_is_installed')
    action_cate = layers.real_valued_column('action_cate')
    action_cate_recent = layers.real_valued_column('action_cate_recent')
    action_installed = layers.real_valued_column('action_installed')
    tt_cnt_appcate = layers.real_valued_column('tt_cnt_appcate')
    tt_is_installed = layers.real_valued_column('tt_is_installed')
    clickTime_day = layers.real_valued_column('clickTime_day')
    clickTime_hour = layers.real_valued_column('clickTime_hour')
    clickTime_minute = layers.real_valued_column('clickTime_minute')

    # Transformations: bucketize selected continuous columns so the linear
    # (wide) part can learn per-range weights.
    age_buckets = layers.bucketized_column(
        age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
    inst_app_installed_buckets = layers.bucketized_column(
        inst_app_installed,
        boundaries=[1000, 5000, 10000, 50000, 100000, 500000])
    clickTime_hour_buckets = layers.bucketized_column(
        clickTime_hour, boundaries=[8, 11, 14, 17, 19, 22])

    # Wide columns and deep columns.
    # (The original carried large commented-out blocks trying
    # layers.embedding_column(..., dimension=8) and layers.one_hot_column(...)
    # for every id column here — earlier experiments, condensed into this note.)
    wide_columns = [
        userID,
        creativeID,
        positionID,
        adID,
        camgaignID,
        advertiserID,
        appID,
        sitesetID,
        appCategory,
        appPlatform,
        education,
        gender,
        haveBaby,
        marriageStatus,
        positionType,
        hometown_c,
        hometown_p,
        residence_c,
        residence_p,
        telecomsOperator,
        connectionType,
        clickTime_week,
        age_buckets,
        clickTime_hour_buckets,
        inst_app_installed_buckets,
    ]
    deep_columns = [
        # Every sparse id column embedded into 8 dimensions for the DNN part.
        layers.embedding_column(userID, dimension=8),
        layers.embedding_column(creativeID, dimension=8),
        layers.embedding_column(positionID, dimension=8),
        layers.embedding_column(adID, dimension=8),
        layers.embedding_column(camgaignID, dimension=8),
        layers.embedding_column(advertiserID, dimension=8),
        layers.embedding_column(appID, dimension=8),
        layers.embedding_column(sitesetID, dimension=8),
        layers.embedding_column(appCategory, dimension=8),
        layers.embedding_column(appPlatform, dimension=8),
        layers.embedding_column(education, dimension=8),
        layers.embedding_column(gender, dimension=8),
        layers.embedding_column(haveBaby, dimension=8),
        layers.embedding_column(marriageStatus, dimension=8),
        layers.embedding_column(positionType, dimension=8),
        layers.embedding_column(hometown_c, dimension=8),
        layers.embedding_column(hometown_p, dimension=8),
        layers.embedding_column(residence_c, dimension=8),
        layers.embedding_column(residence_p, dimension=8),
        layers.embedding_column(telecomsOperator, dimension=8),
        layers.embedding_column(connectionType, dimension=8),
        layers.embedding_column(clickTime_week, dimension=8),
        # Raw continuous features go straight into the DNN.
        age,
        action_cate,
        action_cate_recent,
        action_installed,
        inst_app_installed,
        inst_cate_percent,
        inst_cnt_appcate,
        inst_cnt_installed,
        inst_is_installed,
        tt_cnt_appcate,
        tt_is_installed,
        clickTime_day,
        clickTime_hour,
        clickTime_minute,
    ]
    if model_type == "wide":
        m = tf.contrib.learn.LinearClassifier(model_dir=model_dir,
                                              feature_columns=wide_columns)
    elif model_type == "deep":
        m = tf.contrib.learn.DNNClassifier(model_dir=model_dir,
                                           feature_columns=deep_columns,
                                           hidden_units=[100, 50])
    else:
        # NOTE(review): dnn_hidden_units ends with a width-1 hidden layer
        # ([100, 50, 1]) unlike the pure-deep branch's [100, 50] — looks
        # like a possible mistake; confirm it is intentional.
        m = tf.contrib.learn.DNNLinearCombinedClassifier(
            model_dir=model_dir,
            linear_feature_columns=wide_columns,
            dnn_feature_columns=deep_columns,
            dnn_hidden_units=[100, 50, 1],
            fix_global_step_increment_bug=True)
    return m
# Turn every categorical column into an integer-id column named "<col>_ids",
# dropping the original string column and keeping the fitted encoder around
# so the same mapping can be applied at prediction time.
for column in categorical_vars:
    fitted = LabelEncoder().fit(X_train[column])
    for frame in (X_train, X_test):
        frame[column + '_ids'] = fitted.transform(frame[column])
        frame.pop(column)
    categorical_var_encoders[column] = fitted

### Note: Feature Columns currently (2016/10/22) not working, update is coming.
# Setup feature columns.
CATEGORICAL_EMBED_SIZE = 10  # Note, you can customize this per variable.
feature_columns = (
    [layers.real_valued_column(column) for column in continues_vars]
    + [
        layers.embedding_column(
            layers.sparse_column_with_integerized_feature(
                column + '_ids',
                len(categorical_var_encoders[column].classes_)),
            CATEGORICAL_EMBED_SIZE)
        for column in categorical_vars
    ]
)

# Linear classifier.
'''
random.seed(42)
tflr = learn.LinearClassifier(n_classes=2,
    feature_columns=feature_columns,
    optimizer=tf.train.GradientDescentOptimizer(learning_rate=0.05))
tflr.fit(input_fn=train_input_fn, steps=500)
print(list(tflr.predict(input_fn=test_input_fn, as_iterable=True)), y_test)
print(accuracy_score(y_test, list(tflr.predict(input_fn=test_input_fn, as_iterable=True))))
'''