Example #1
    def __init__(self, args):
        super(SkipThoughtsModel, self).__init__(args)
        self.uniform_initializer = tf.random_uniform_initializer(
            minval=-self.uniform_init_scale, maxval=self.uniform_init_scale)

        with open(args.vocab, 'r') as f:
            self.vocab = [l.strip() for l in f]
        # Set up input parsing stuff via tf.contrib.learn...
        self.vocab_size = len(self.vocab)
        encode = layers.sparse_column_with_integerized_feature(
            'encode', bucket_size=self.vocab_size)
        decode_pre = layers.sparse_column_with_integerized_feature(
            'decode_pre', bucket_size=self.vocab_size)
        decode_post = layers.sparse_column_with_integerized_feature(
            'decode_post', bucket_size=self.vocab_size)
        self.features = {
            'encode': layers.embedding_column(encode, dimension=100),
            'decode_pre': layers.embedding_column(decode_pre, dimension=100),
            'decode_post': layers.embedding_column(decode_post, dimension=100),
        }
        self.feature_spec = tf.contrib.layers.create_feature_spec_for_parsing(
            self.features)
        # ... or do it the easy way:
        self.features = {
            'encode': tf.VarLenFeature(dtype=tf.int64),
            'decode_pre': tf.VarLenFeature(dtype=tf.int64),
            'decode_post': tf.VarLenFeature(dtype=tf.int64),
        }
        self.feature_spec = self.features
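
A minimal sketch of how the resulting self.feature_spec might be consumed elsewhere in the model; the `serialized_examples` placeholder is hypothetical and not part of the original snippet:

serialized_examples = tf.placeholder(tf.string, shape=[None])
parsed = tf.parse_example(serialized_examples, self.feature_spec)
# parsed['encode'] etc. are tf.SparseTensors of int64 token ids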
Example #2
INPUT_COLUMN_NAMES = {
    'dayofweek': tf.string,
    'hourofday': tf.int32,
    'pickuplon': tf.float32,
    'pickuplat': tf.float32,
    'dropofflon': tf.float32,
    'dropofflat': tf.float32,
    'passengers': tf.int32
}

# These are the raw input columns; they will also be provided at prediction time
INPUT_COLUMNS = [
    # sparse categorical features
    layers.sparse_column_with_keys(
        'dayofweek', keys=['Sun', 'Mon', 'Tues', 'Wed', 'Thu', 'Fri', 'Sat']),
    layers.sparse_column_with_integerized_feature('hourofday', bucket_size=24),

    # engineered features that are created in the input_fn (sketched below)
    layers.real_valued_column('latdiff'),
    layers.real_valued_column('londiff'),
    layers.real_valued_column('euclidean'),

    # raw continuous input columns
    layers.real_valued_column('pickuplon'),
    layers.real_valued_column('pickuplat'),
    layers.real_valued_column('dropofflat'),
    layers.real_valued_column('dropofflon'),
    layers.real_valued_column('passengers'),
]
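
# A hedged sketch of how the engineered columns above might be computed inside
# the input_fn; the exact formulas are an assumption inferred from the names:
def add_engineered(features):
    features['latdiff'] = features['pickuplat'] - features['dropofflat']
    features['londiff'] = features['dropofflon'] - features['pickuplon']
    features['euclidean'] = tf.sqrt(
        features['latdiff'] ** 2 + features['londiff'] ** 2)
    return features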

# Encode categorical string columns to integer ids (assumes scikit-learn's
# LabelEncoder and that categorical_vars, X_train, X_test exist upstream).
from sklearn.preprocessing import LabelEncoder

categorical_var_encoders = {}
for var in categorical_vars:
  le = LabelEncoder().fit(X_train[var])
  X_train[var + '_ids'] = le.transform(X_train[var])
  X_test[var + '_ids'] = le.transform(X_test[var])
  X_train.pop(var)
  X_test.pop(var)
  categorical_var_encoders[var] = le

### Note: Feature Columns are currently (2016/10/22) not working; an update is coming.
# Setup feature columns.
CATEGORICAL_EMBED_SIZE = 10  # Note: this can be customized per variable.
feature_columns = [
  layers.real_valued_column(var) for var in continues_vars
] + [
  layers.embedding_column(
     layers.sparse_column_with_integerized_feature(
       var + '_ids', len(categorical_var_encoders[var].classes_)), 
     CATEGORICAL_EMBED_SIZE) for var in
  categorical_vars
]


# Linear classifier.
'''
random.seed(42)
tflr = learn.LinearClassifier(n_classes=2,
    feature_columns=feature_columns,
    optimizer=tf.train.GradientDescentOptimizer(learning_rate=0.05))
tflr.fit(input_fn=train_input_fn, steps=500)
print(list(tflr.predict(input_fn=test_input_fn, as_iterable=True)), y_test)
print(accuracy_score(y_test, list(tflr.predict(input_fn=test_input_fn, as_iterable=True))))
'''
Example #4
def gen_feature(feature_conf):
    # feature_name_key / value_type_key are assumed module-level constants
    # (e.g. 'feature_name' and 'value_type') defined elsewhere in the file.
    name = feature_conf[feature_name_key]
    value_type = feature_conf[value_type_key]

    if "vocab_size" in feature_conf:
        id_feature = fc.sparse_column_with_keys(
            column_name=name,
            # keys must match dtype=tf.string, so stringify the integer ids
            keys=[str(i) for i in range(feature_conf['vocab_size'])],
            dtype=tf.string)

        return _EmbeddingColumn(
            id_feature,
            dimension=feature_conf['embedding_dimension'],
            shared_embedding_name=feature_conf.get(feature_name_key),
        )
    elif "hash_bucket_size" in feature_conf \
            and "embedding_dimension" not in feature_conf:
        if value_type == "Int":
            id_feature = layers.sparse_column_with_integerized_feature(
                column_name=name,
                bucket_size=feature_conf['hash_bucket_size'],
                combiner=_get_combiner(feature_conf),
                # use_hashmap=use_hashmap
            )
        else:
            id_feature = layers.sparse_column_with_hash_bucket(
                column_name=name,
                hash_bucket_size=feature_conf['hash_bucket_size'],
                combiner=_get_combiner(feature_conf),
                # use_hashmap=use_hashmap
            )
        return id_feature
    elif "embedding_dimension" in feature_conf \
            and "hash_bucket_size" in feature_conf \
            and "boundaries" not in feature_conf \
            and "vocabulary_file" not in feature_conf:
        if value_type == "Int":
            return _EmbeddingColumn(
                sparse_id_column=layers.sparse_column_with_integerized_feature(
                    column_name=name,
                    bucket_size=feature_conf['hash_bucket_size'],
                    combiner=_get_combiner(feature_conf),
                    # use_hashmap=use_hashmap
                ),
                dimension=feature_conf['embedding_dimension'],
                combiner=_get_combiner(feature_conf),
                shared_embedding_name=feature_conf.get('shared_name', None))
        else:
            id_feature = layers.sparse_column_with_hash_bucket(
                column_name=name,
                hash_bucket_size=feature_conf['hash_bucket_size'],
                # use_hashmap=use_hashmap
            )
            return _EmbeddingColumn(
                id_feature,
                dimension=feature_conf['embedding_dimension'],
                combiner=_get_combiner(feature_conf),
                shared_embedding_name=feature_conf.get('shared_name', None),
                max_norm=None)
    elif "embedding_dimension" in feature_conf \
            and "boundaries" not in feature_conf and "vocabulary_file" in feature_conf:
        use_hashmap = feature_conf.get("use_hashmap", False)
        if value_type == "Int":
            raise Exception(
                "embedding with vocabulary_file does not support Int type")
        else:
            id_feature = fc.sparse_column_with_vocabulary_file(
                column_name=name,
                vocabulary_file=feature_conf["vocabulary_file"],
                num_oov_buckets=feature_conf["num_oov_buckets"],
                vocab_size=feature_conf["vocab_size"],
            )
            return _EmbeddingColumn(
                id_feature,
                dimension=feature_conf['embedding_dimension'],
                combiner=_get_combiner(feature_conf),
                shared_embedding_name=feature_conf.get('shared_name', None),
                max_norm=None)
    elif "embedding_dimension" in feature_conf \
            and "boundaries" in feature_conf:
        return embedding_bucketized_column(
            layers.real_valued_column(
                column_name=name,
                dimension=feature_conf.get('dimension', 1),
                default_value=[
                    0.0 for _ in range(int(feature_conf.get('dimension', 1)))
                ]),
            boundaries=[
                float(b) for b in feature_conf['boundaries'].split(',')
            ],
            embedding_dimension=feature_conf["embedding_dimension"],
            max_norm=None,
            shared_name=feature_conf.get('shared_name', None),
            add_random=feature_conf.get('add_random', False))
    elif "embedding_dimension" not in feature_conf \
            and "boundaries" in feature_conf:
        return layers.bucketized_column(
            layers.real_valued_column(
                column_name=name,
                dimension=feature_conf.get('dimension', 1),
                default_value=[
                    0.0 for _ in range(int(feature_conf.get('dimension', 1)))
                ]),
            boundaries=[
                float(b) for b in feature_conf['boundaries'].split(',')
            ])
    else:
        return layers.real_valued_column(
            column_name=name,
            dimension=feature_conf.get('dimension', 1),
            default_value=[
                0.0 for _ in range(int(feature_conf.get('dimension', 1)))
            ],
            normalizer=None if 'l2_norm' not in feature_conf else
            lambda x: tf.nn.l2_normalize(x, dim=-1))
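
A hedged usage sketch for gen_feature; this feature_conf is hypothetical and assumes feature_name_key == 'feature_name' and value_type_key == 'value_type':

conf = {
    'feature_name': 'user_id',
    'value_type': 'String',
    'hash_bucket_size': 100000,
    'embedding_dimension': 16,
}
# hits the hash_bucket + embedding branch: a 16-d embedding over 100k buckets
user_id_column = gen_feature(conf)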
Example #5
import tensorflow as tf
from tensorflow.contrib import layers
from tensorflow.contrib import metrics
import numpy as np

tf.logging.set_verbosity(tf.logging.INFO)

CSV_COLUMNS = 'fare_amount,dayofweek,hourofday,pickuplon,pickuplat,dropofflon,dropofflat,passengers,key'.split(',')
SCALE_COLUMNS = ['pickuplon','pickuplat','dropofflon','dropofflat','passengers']
LABEL_COLUMN = 'fare_amount'
KEY_FEATURE_COLUMN = 'key'
DEFAULTS = [[0.0], ['Sun'], [0], [-74.0], [40.0], [-74.0], [40.7], [1.0], ['nokey']]
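
# A hedged sketch of the CSV-parsing piece of the input_fn, built from the
# CSV_COLUMNS / DEFAULTS / LABEL_COLUMN constants above (file-queue wiring
# omitted; `value_column` is a batch of raw CSV lines):
def decode_csv(value_column):
    columns = tf.decode_csv(value_column, record_defaults=DEFAULTS)
    features = dict(zip(CSV_COLUMNS, columns))
    label = features.pop(LABEL_COLUMN)
    return features, label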

# These are the raw input columns; they will also be provided at prediction time
INPUT_COLUMNS = [
    # sparse categorical features
    layers.sparse_column_with_keys('dayofweek', keys=['Sun', 'Mon', 'Tues', 'Wed', 'Thu', 'Fri', 'Sat']),
    layers.sparse_column_with_integerized_feature('hourofday', bucket_size=24),

    # engineered features that are created in the input_fn
    layers.real_valued_column('latdiff'),
    layers.real_valued_column('londiff'),
    layers.real_valued_column('euclidean'),

    # raw continuous input columns
    layers.real_valued_column('pickuplon'),
    layers.real_valued_column('pickuplat'),
    layers.real_valued_column('dropofflat'),
    layers.real_valued_column('dropofflon'),
    layers.real_valued_column('passengers'),
]

def build_estimator(model_dir, nbuckets, hidden_units):
    ...  # body omitted


def get_input(pattern, features, shuffle=False):
    examples = tf.contrib.learn.io.read_keyed_batch_features(
        pattern,
        batch_size=1,
        features=features,
        randomize_input=shuffle,
        reader=tf.TFRecordReader)
    return examples


# Vocab and base_dir are assumed to be defined earlier in the original file.
vocab = Vocab(base_dir + '/vocab.txt')
vocab_size = len(vocab)

if tf_layers:  # flag (assumed upstream) selecting the tf.contrib.layers path
    encode = layers.sparse_column_with_integerized_feature(
        'encode', bucket_size=vocab_size)
    decode_pre = layers.sparse_column_with_integerized_feature(
        'decode_pre', bucket_size=vocab_size)
    decode_post = layers.sparse_column_with_integerized_feature(
        'decode_post', bucket_size=vocab_size)
    features = {
        'encode': encode,
        'encode_emb': layers.embedding_column(encode, dimension=100),
        'decode_pre': layers.embedding_column(decode_pre, dimension=100),
        'decode_post': layers.embedding_column(decode_post, dimension=100),
    }
    features = tf.contrib.layers.create_feature_spec_for_parsing(features)
else:
    # This little dict seems equivalent to the way more verbose
    # tf.contrib.layers approach. But apparently the latter helps,
    # especially when it comes to TF Serving. Still waiting to see the benefit...
    features = {k: tf.VarLenFeature(dtype=tf.int64)
                for k in ('encode', 'decode_pre', 'decode_post')}
Example #7
def build_estimator(model_dir=MODEL_DIR):
    """
    Build an estimator using
    CONTINUOUS_COLUMNS, BINARY_COLUMNS and MULTI_CATEGORY_COLUMNS.
    """
    # Sparse columns: hashed multi-category plus integerized binary features.
    sparse_columns = \
        [sparse_column_with_hash_bucket(col, 1000)
         for col in MULTI_CATEGORY_COLUMNS] + \
        [sparse_column_with_integerized_feature(col, bucket_size=2)
         for col in BINARY_COLUMNS]

    real_valued_columns = \
        [real_valued_column(col) for col in CONTINUOUS_COLUMNS]

    crossed_columns = []

    # Wide columns and deep columns.
    wide_columns = \
        sparse_columns + \
        real_valued_columns + \
        crossed_columns

    # Embedding columns for the sparse (hash-bucket) columns.
    deep_columns = \
        [embedding_column(col, dimension=EMBEDDING_DIMENSION)
         for col in sparse_columns] + \
        real_valued_columns + \
        crossed_columns

    if MODEL_TYPE == "wide":
        print('Creating wide LinearClassifier model...\n')
        model = tf.contrib.learn.LinearClassifier(
            model_dir=model_dir,
            n_classes=2,
            feature_columns=wide_columns,
            # optimizer=tf.train.GradientDescentOptimizer(
            #     learning_rate=FLAGS.learn_rate)
            # optimizer=tf.train.FtrlOptimizer(
            #     learning_rate=LEARN_RATE,
            #     l1_regularization_strength=0.0,
            #     l2_regularization_strength=0.0),
        )

    elif MODEL_TYPE == "deep":
        print('Creating deep DNNClassifier model...\n')
        model = tf.contrib.learn.DNNClassifier(
            model_dir=model_dir,
            n_classes=2,
            feature_columns=deep_columns,
            hidden_units=HIDDEN_UNITS,
            # optimizer=tf.train.FtrlOptimizer(
            #     learning_rate=LEARN_RATE,
            #     l1_regularization_strength=0.0,
            #     l2_regularization_strength=0.0),
        )
    else:
        print('Creating deepNwide DNNLinearCombinedClassifier model...\n')
        model = tf.contrib.learn.DNNLinearCombinedClassifier(
            model_dir=model_dir,
            n_classes=2,
            linear_feature_columns=wide_columns,
            dnn_feature_columns=deep_columns,
            dnn_hidden_units=HIDDEN_UNITS,
            # optimizer=tf.train.FtrlOptimizer(
            #     learning_rate=LEARN_RATE,
            #     l1_regularization_strength=0.0,
            #     l2_regularization_strength=0.0),
        )

    return model
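
A hedged usage sketch; the model directory, train_input_fn and step count below are hypothetical stand-ins for whatever the original script wires up:

model = build_estimator('/tmp/wide_n_deep_model')
model.fit(input_fn=train_input_fn, steps=1000)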
Example #8
def build_estimator(model_dir, model_type):
    """Build an estimator."""
    # Sparse base columns.
    userID = layers.sparse_column_with_integerized_feature('userID', 2805118)
    creativeID = layers.sparse_column_with_integerized_feature(
        'creativeID', 6582)
    positionID = layers.sparse_column_with_integerized_feature(
        'positionID', 7645)
    adID = layers.sparse_column_with_integerized_feature('adID', 3616)
    camgaignID = layers.sparse_column_with_integerized_feature(
        'camgaignID', 720)
    advertiserID = layers.sparse_column_with_integerized_feature(
        'advertiserID', 91)
    appID = layers.sparse_column_with_integerized_feature('appID', 50)
    sitesetID = layers.sparse_column_with_integerized_feature('sitesetID', 3)
    appCategory = layers.sparse_column_with_integerized_feature(
        'appCategory', 14)
    appPlatform = layers.sparse_column_with_integerized_feature(
        'appPlatform', 2)
    education = layers.sparse_column_with_integerized_feature('education', 8)
    gender = layers.sparse_column_with_integerized_feature('gender', 3)
    haveBaby = layers.sparse_column_with_integerized_feature('haveBaby', 7)
    marriageStatus = layers.sparse_column_with_integerized_feature(
        'marriageStatus', 4)
    positionType = layers.sparse_column_with_integerized_feature(
        'positionType', 6)
    hometown_c = layers.sparse_column_with_integerized_feature(
        'hometown_c', 22)
    hometown_p = layers.sparse_column_with_integerized_feature(
        'hometown_p', 35)
    residence_c = layers.sparse_column_with_integerized_feature(
        'residence_c', 22)
    residence_p = layers.sparse_column_with_integerized_feature(
        'residence_p', 35)
    telecomsOperator = layers.sparse_column_with_integerized_feature(
        'telecomsOperator', 4)
    connectionType = layers.sparse_column_with_integerized_feature(
        'connectionType', 5)
    clickTime_week = layers.sparse_column_with_integerized_feature(
        'clickTime_week', 7)

    # Continuous base columns.
    age = layers.real_valued_column("age")
    inst_app_installed = layers.real_valued_column('inst_app_installed')
    inst_cate_percent = layers.real_valued_column('inst_cate_percent')
    inst_cnt_appcate = layers.real_valued_column('inst_cnt_appcate')
    inst_cnt_installed = layers.real_valued_column('inst_cnt_installed')
    inst_is_installed = layers.real_valued_column('inst_is_installed')
    action_cate = layers.real_valued_column('action_cate')
    action_cate_recent = layers.real_valued_column('action_cate_recent')
    action_installed = layers.real_valued_column('action_installed')
    tt_cnt_appcate = layers.real_valued_column('tt_cnt_appcate')
    tt_is_installed = layers.real_valued_column('tt_is_installed')
    clickTime_day = layers.real_valued_column('clickTime_day')
    clickTime_hour = layers.real_valued_column('clickTime_hour')
    clickTime_minute = layers.real_valued_column('clickTime_minute')

    # Transformations.
    age_buckets = layers.bucketized_column(
        age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
    inst_app_installed_buckets = layers.bucketized_column(
        inst_app_installed,
        boundaries=[1000, 5000, 10000, 50000, 100000, 500000])
    clickTime_hour_buckets = layers.bucketized_column(
        clickTime_hour, boundaries=[8, 11, 14, 17, 19, 22])
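
    # For example, with the age boundaries above a raw age of 33 falls into
    # the bucket [30, 35), i.e. one-hot bucket index 3 (the buckets are
    # (-inf, 18), [18, 25), [25, 30), [30, 35), ...).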

    # Wide columns and deep columns.
    wide_columns = [
        userID,
        creativeID,
        positionID,
        adID,
        camgaignID,
        advertiserID,
        appID,
        sitesetID,
        appCategory,
        appPlatform,
        education,
        gender,
        haveBaby,
        marriageStatus,
        positionType,
        hometown_c,
        hometown_p,
        residence_c,
        residence_p,
        telecomsOperator,
        connectionType,
        clickTime_week,

        # layers.embedding_column(userID, dimension=8),
        # layers.embedding_column(creativeID, dimension=8),
        # layers.embedding_column(positionID, dimension=8),
        # layers.embedding_column(adID, dimension=8),
        # layers.embedding_column(camgaignID, dimension=8),
        # layers.embedding_column(advertiserID, dimension=8),
        # layers.embedding_column(appID, dimension=8),
        # layers.embedding_column(sitesetID, dimension=8),
        # layers.embedding_column(appCategory, dimension=8),
        # layers.embedding_column(appPlatform, dimension=8),
        # layers.embedding_column(education, dimension=8),
        # layers.embedding_column(gender, dimension=8),
        # layers.embedding_column(haveBaby, dimension=8),
        # layers.embedding_column(marriageStatus, dimension=8),
        # layers.embedding_column(positionType, dimension=8),
        # layers.embedding_column(hometown_c, dimension=8),
        # layers.embedding_column(hometown_p, dimension=8),
        # layers.embedding_column(residence_c, dimension=8),
        # layers.embedding_column(residence_p, dimension=8),
        # layers.embedding_column(telecomsOperator, dimension=8),
        # layers.embedding_column(connectionType, dimension=8),
        # layers.embedding_column(clickTime_week, dimension=8),
        # layers.one_hot_column(userID),
        # layers.one_hot_column(creativeID),
        # layers.one_hot_column(positionID),
        # layers.one_hot_column(adID),
        # layers.one_hot_column(camgaignID),
        # layers.one_hot_column(advertiserID),
        # layers.one_hot_column(appID),
        # layers.one_hot_column(sitesetID),
        # layers.one_hot_column(appCategory),
        # layers.one_hot_column(appPlatform),
        # layers.one_hot_column(education),
        # layers.one_hot_column(gender),
        # layers.one_hot_column(haveBaby),
        # layers.one_hot_column(marriageStatus),
        # layers.one_hot_column(positionType),
        # layers.one_hot_column(hometown_c),
        # layers.one_hot_column(hometown_p),
        # layers.one_hot_column(residence_c),
        # layers.one_hot_column(residence_p),
        # layers.one_hot_column(telecomsOperator),
        # layers.one_hot_column(connectionType),
        # layers.one_hot_column(clickTime_week),
        age_buckets,
        clickTime_hour_buckets,
        inst_app_installed_buckets,
    ]

    deep_columns = [
        layers.embedding_column(userID, dimension=8),
        layers.embedding_column(creativeID, dimension=8),
        layers.embedding_column(positionID, dimension=8),
        layers.embedding_column(adID, dimension=8),
        layers.embedding_column(camgaignID, dimension=8),
        layers.embedding_column(advertiserID, dimension=8),
        layers.embedding_column(appID, dimension=8),
        layers.embedding_column(sitesetID, dimension=8),
        layers.embedding_column(appCategory, dimension=8),
        layers.embedding_column(appPlatform, dimension=8),
        layers.embedding_column(education, dimension=8),
        layers.embedding_column(gender, dimension=8),
        layers.embedding_column(haveBaby, dimension=8),
        layers.embedding_column(marriageStatus, dimension=8),
        layers.embedding_column(positionType, dimension=8),
        layers.embedding_column(hometown_c, dimension=8),
        layers.embedding_column(hometown_p, dimension=8),
        layers.embedding_column(residence_c, dimension=8),
        layers.embedding_column(residence_p, dimension=8),
        layers.embedding_column(telecomsOperator, dimension=8),
        layers.embedding_column(connectionType, dimension=8),
        layers.embedding_column(clickTime_week, dimension=8),
        age,
        action_cate,
        action_cate_recent,
        action_installed,
        inst_app_installed,
        inst_cate_percent,
        inst_cnt_appcate,
        inst_cnt_installed,
        inst_is_installed,
        tt_cnt_appcate,
        tt_is_installed,
        clickTime_day,
        clickTime_hour,
        clickTime_minute,
    ]

    if model_type == "wide":
        m = tf.contrib.learn.LinearClassifier(model_dir=model_dir,
                                              feature_columns=wide_columns)
    elif model_type == "deep":
        m = tf.contrib.learn.DNNClassifier(model_dir=model_dir,
                                           feature_columns=deep_columns,
                                           hidden_units=[100, 50])
    else:
        m = tf.contrib.learn.DNNLinearCombinedClassifier(
            model_dir=model_dir,
            linear_feature_columns=wide_columns,
            dnn_feature_columns=deep_columns,
            dnn_hidden_units=[100, 50, 1],
            # ensures global_step is incremented once per training step
            fix_global_step_increment_bug=True)
    return m