Example #1
def transform(inputs, num_cols, cat_cols):
    print("Inputs before features transformation: {}".format(inputs.keys()))

    # Pass-through columns
    transformed = inputs.copy()

    feature_columns = {
        colname: tf.feature_column.numeric_column(colname)
        for colname in num_cols
    }

    # Add Euclidean distance
    transformed['euclidean'] = layers.Lambda(euclidean, name='euclidean')([
        inputs['pickuplon'], inputs['pickuplat'], inputs['dropofflon'],
        inputs['dropofflat']
    ])
    feature_columns['euclidean'] = fc.numeric_column('euclidean')

    # Shift 'dayofweek' feature to a value range of 0-6

    transformed['dayofweek'] = transformed['dayofweek'] - 1

    # Create categorical columns (wrapped in indicator columns)

    feature_columns['hourofday'] = fc.indicator_column(
        fc.categorical_column_with_identity('hourofday', 24))
    feature_columns['dayofweek'] = fc.indicator_column(
        fc.categorical_column_with_identity('dayofweek', 7))

    print("Transformed features: {}".format(transformed.keys()))
    print("Feature columns: {}".format(feature_columns.keys()))
    return transformed, feature_columns
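For orientation, a minimal sketch (not part of the original example) of how the returned pair is typically consumed: the feature columns densify the transformed inputs, which then feed a small Keras head. The layer sizes and model shape here are assumptions.

# Assumed downstream wiring for transform(); layer sizes are illustrative only.
transformed, feature_columns = transform(inputs, num_cols, cat_cols)
dnn_inputs = tf.keras.layers.DenseFeatures(list(feature_columns.values()))(transformed)
hidden = tf.keras.layers.Dense(32, activation='relu')(dnn_inputs)
output = tf.keras.layers.Dense(1)(hidden)
model = tf.keras.Model(inputs=list(inputs.values()), outputs=output)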
Example #2
def generateFeatureColumn(self):
    for columnName in self.featureList:
        if columnName != "prognosis":
            self.featureColumn.append(
                feature_column.categorical_column_with_identity(
                    key=columnName, num_buckets=2))
        else:
            self.featureColumn.append(
                feature_column.categorical_column_with_identity(
                    key=columnName, num_buckets=41))
Example #3
def create_user_feature_columns():
  gender = fc.indicator_column(fc.categorical_column_with_identity("gender", num_buckets=3, default_value=0))
  age_class = fc.indicator_column(fc.categorical_column_with_identity("age_class", num_buckets=7, default_value=0))
  has_baby = fc.indicator_column(fc.categorical_column_with_identity("has_baby", num_buckets=2, default_value=0))
  baby_gender = fc.indicator_column(fc.categorical_column_with_identity("baby_gender", num_buckets=3, default_value=0))
  baby_age = fc.indicator_column(fc.categorical_column_with_identity("baby_age", num_buckets=7, default_value=0))
  grade = fc.indicator_column(fc.categorical_column_with_identity("grade", num_buckets=7, default_value=0))
  rfm_type = fc.indicator_column(fc.categorical_column_with_identity("bi_rfm_type", num_buckets=12, default_value=0))
  cate1_price_prefer = fc.indicator_column(fc.categorical_column_with_identity("cate1_price_prefer", num_buckets=6, default_value=0))
  cate2_price_prefer = fc.indicator_column(fc.categorical_column_with_identity("cate2_price_prefer", num_buckets=6, default_value=0))
  cate3_price_prefer = fc.indicator_column(fc.categorical_column_with_identity("cate3_price_prefer", num_buckets=6, default_value=0))
  city_id = fc.categorical_column_with_hash_bucket("city", 700)
  city = fc.shared_embedding_columns([city_id], 16)
  cols = [gender, age_class, has_baby, baby_gender, baby_age, grade, rfm_type, cate1_price_prefer, cate2_price_prefer, cate3_price_prefer]
  return cols + city
Example #4
def build_features(statistics):
    pu_location_id = fc.categorical_column_with_identity(key='PULocationID',
                                                         num_buckets=265)
    do_location_id = fc.categorical_column_with_identity(key='DOLocationID',
                                                         num_buckets=265)
    day_of_week = fc.categorical_column_with_identity(key='day_of_week',
                                                      num_buckets=7)
    weekend = fc.categorical_column_with_identity(key='weekend', num_buckets=2)
    speed_buckets = fc.bucketized_column(
        fc.numeric_column('speed'), boundaries=[10, 20, 30, 40, 50, 60, 70])
    distance_buckets = fc.bucketized_column(
        fc.numeric_column('trip_distance'),
        boundaries=[2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30])
    duration_buckets = fc.bucketized_column(
        fc.numeric_column('duration'),
        boundaries=[500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500])
    fare_buckets = fc.bucketized_column(
        fc.numeric_column('fare_amount'),
        boundaries=[2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30])
    passenger_buckets = fc.bucketized_column(
        fc.numeric_column('passenger_count'), boundaries=[1, 3, 5, 7, 9])
    location = fc.crossed_column([pu_location_id, do_location_id],
                                 hash_bucket_size=1000)
    cross_all = fc.crossed_column([
        location, speed_buckets, distance_buckets, duration_buckets,
        fare_buckets, passenger_buckets
    ],
                                  hash_bucket_size=1000)
    categorical_columns = [
        fc.embedding_column(pu_location_id, dimension=32),
        fc.embedding_column(do_location_id, dimension=32),
        fc.indicator_column(day_of_week),
        fc.indicator_column(weekend)
    ]
    numeric_columns = [
        custom_numeric_column('passenger_count', statistics),
        custom_numeric_column('trip_distance', statistics),
        custom_numeric_column('fare_amount', statistics),
        custom_numeric_column('extra', statistics),
        custom_numeric_column('mta_tax', statistics),
        custom_numeric_column('tolls_amount', statistics),
        custom_numeric_column('improvement_surcharge', statistics),
        custom_numeric_column('duration', statistics),
        custom_numeric_column('speed', statistics)
    ]
    dnn_feature_columns = numeric_columns + categorical_columns
    linear_feature_columns = [location, cross_all]
    return dnn_feature_columns, linear_feature_columns
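Since build_features returns separate deep and linear column lists, it presumably feeds a wide-and-deep model; a hedged sketch of that usage follows (the estimator choice and hidden-unit sizes are assumptions, not part of the original).

# Illustrative only: consume the two column lists with a wide-and-deep estimator.
dnn_cols, linear_cols = build_features(statistics)
model = tf.estimator.DNNLinearCombinedRegressor(
    linear_feature_columns=linear_cols,
    dnn_feature_columns=dnn_cols,
    dnn_hidden_units=[128, 64])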
Example #5
  def _prepare_for_crossing(self, key_name, num_bck, boundaries):
    """Prepares features for crossing.

    How a feature is prepared depends on whether it is continuous or
    categorical, and on whether bucket boundaries are supplied.

    Args:
      key_name: A string representing the name of the feature
      num_bck: Number of buckets to use when the count of distinct values is known
      boundaries: Boundaries to use when bucketizing a continuous feature
    Returns:
      A feature column (bucketized or identity) or the raw key name, ready for crossing
    """
    key = None
    if key_name in self.continuous.keys():
      if boundaries is not None:
        # Note that cont[key_name] is a source column
        key = tfc.bucketized_column(self.continuous[key_name], boundaries)
      else:
        # We can count all the values in the dataset. Ex: boolean.
        # Note that key_name is a string
        key = tfc.categorical_column_with_identity(key_name, num_bck)
    elif key_name in self.categorical.keys():
      # It is also possible to use the categorical column instead of the
      # column name. i.e key = cat[key_name]
      key = key_name
    else:
      key = key_name

    return key
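A minimal sketch of how the prepared keys might then be crossed; `builder` stands for an instance of the surrounding class, and the feature names, boundaries and bucket size are assumptions.

# Illustrative only: cross a bucketized continuous feature with a small categorical one.
age_key = builder._prepare_for_crossing('age', None, [18, 25, 35, 50, 65])
gender_key = builder._prepare_for_crossing('gender', 2, None)
age_x_gender = tfc.crossed_column([age_key, gender_key], hash_bucket_size=1000)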
Example #6
    def tf_inputs_dataframe(self, batch_size=1, buffer_size=1000):
        dataframe = read_csv(
            os.path.join(os.path.dirname(self.json_filename),
                         self.data_description["csv"]))
        labels_name = 'ga_edd'
        y_name = labels_name

        for column_name in dataframe.columns:
            if column_name.startswith('_'):
                dataframe.pop(column_name)

        for header in [
                'fl_1', 'bp_1', 'hc_1', 'ac_1', 'mom_age_edd', 'mom_weight_lb',
                'mom_height_in'
        ]:
            r = max(dataframe[header]) - min(dataframe[header])
            dataframe[header] = (dataframe[header] -
                                 min(dataframe[header])) / r

        dataframe = dataframe[(dataframe[y_name] != '.')
                              & (notna(dataframe[y_name])) &
                              (notnull(dataframe[y_name]))].copy()
        dataframe = dataframe.astype({y_name: 'int32'})

        feature_columns = []
        feature_names = []
        num_channels = 0
        for header in [
                'fl_1', 'bp_1', 'hc_1', 'ac_1', 'mom_age_edd', 'mom_weight_lb',
                'mom_height_in'
        ]:
            feature_columns.append(feature_column.numeric_column(header))
            feature_names.append(header)
            num_channels += 1

        num_identity = 2
        for header in [
                'hiv', 'current_smoker', 'former_smoker', 'chronic_htn',
                'preg_induced_htn', 'diabetes', 'gest_diabetes'
        ]:
            col = feature_column.categorical_column_with_identity(
                header, num_identity)
            col = feature_column.indicator_column(col)
            feature_columns.append(col)
            feature_names.append(header)
            num_channels += num_identity

        self.num_channels = num_channels

        feature_layer = tf.keras.layers.DenseFeatures(
            feature_columns=feature_columns)
        dataframe = dataframe.copy()
        labels = dataframe.pop(labels_name)

        dataset = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
        dataset = dataset.shuffle(buffer_size=buffer_size)
        dataset = dataset.batch(batch_size)
        dataset = dataset.map(lambda x, y: (feature_layer(x), y))

        return dataset
Example #7
def test_identity_feature_column():
    sample = {'price': [[1], [2], [3], [0]]}
    # price_column = feature_column.numeric_column('price')
    price_column = feature_column.categorical_column_with_identity(
        key='price', num_buckets=4)
    indicator = feature_column.indicator_column(price_column)
    price_column_tensor = feature_column.input_layer(sample, [indicator])

    with tf.Session() as session:
        print(session.run([price_column_tensor]))
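With num_buckets=4, every integer in the batch maps to its own index, so the indicator output is the corresponding one-hot rows:

# Expected output for the batch [[1], [2], [3], [0]]:
# [array([[0., 1., 0., 0.],
#         [0., 0., 1., 0.],
#         [0., 0., 0., 1.],
#         [1., 0., 0., 0.]], dtype=float32)]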
Example #8
def create_deep_feature_columns():
  phoneOs = fc.indicator_column(
    fc.categorical_column_with_vocabulary_list("phoneOs", ["android", "ios"], default_value=0))
  # context feature
  matchScore = fc.numeric_column("matchScore", default_value=0.0)
  popScore = fc.numeric_column("popScore", default_value=0.0)
  brandPrefer = fc.numeric_column("brandPrefer", default_value=0.0, normalizer_fn=truncate)
  cate2Prefer = fc.numeric_column("cate2Prefer", default_value=0.0, normalizer_fn=truncate)
  catePrefer = fc.numeric_column("catePrefer", default_value=0.0, normalizer_fn=truncate)
  sellerPrefer = fc.numeric_column("sellerPrefer", default_value=0.0, normalizer_fn=truncate)
  matchType = fc.indicator_column(fc.categorical_column_with_identity("matchType", 9, default_value=0))
  triggerNum = fc.indicator_column(fc.categorical_column_with_identity("triggerNum", 41, default_value=40))
  triggerRank = fc.indicator_column(fc.categorical_column_with_identity("triggerRank", 41, default_value=40))
  sceneType = fc.indicator_column(fc.categorical_column_with_identity("type", 2, default_value=0))

  columns = [matchScore, matchType, triggerNum, triggerRank, sceneType,
             phoneOs, popScore, sellerPrefer, brandPrefer, cate2Prefer, catePrefer]
  print("deep feature columns:", columns)
  return columns
Example #9
def transform(inputs, NUMERIC_COLS, STRING_COLS, nbuckets):
    # Pass-through columns
    transformed = inputs.copy()
    del transformed['pickup_datetime']

    feature_columns = {
        colname: fc.numeric_column(colname)
        for colname in NUMERIC_COLS
    }

    # Scaling longitude from range [-78, -70] to [0, 1]
    for lon_col in ['pickup_longitude', 'dropoff_longitude']:
        transformed[lon_col] = layers.Lambda(lambda x: (x + 78) / 8.0,
                                             name='scale_{}'.format(lon_col))(
                                                 inputs[lon_col])

    # Scaling latitude from range [37, 45] to [0, 1]
    for lat_col in ['pickup_latitude', 'dropoff_latitude']:
        transformed[lat_col] = layers.Lambda(lambda x: (x - 37) / 8.0,
                                             name='scale_{}'.format(lat_col))(
                                                 inputs[lat_col])

    # Adding Euclidean dist (no need to be accurate: NN will calibrate it)
    transformed['euclidean'] = layers.Lambda(euclidean, name='euclidean')([
        inputs['pickup_longitude'], inputs['pickup_latitude'],
        inputs['dropoff_longitude'], inputs['dropoff_latitude']
    ])
    feature_columns['euclidean'] = fc.numeric_column('euclidean')

    # hour of day from timestamp of form '2010-02-08 09:17:00+00:00'
    transformed['hourofday'] = layers.Lambda(
        lambda x: tf.strings.to_number(tf.strings.substr(x, 11, 2),
                                       out_type=tf.dtypes.int32),
        name='hourofday')(inputs['pickup_datetime'])
    feature_columns['hourofday'] = fc.indicator_column(
        fc.categorical_column_with_identity('hourofday', num_buckets=24))

    latbuckets = np.linspace(0, 1, nbuckets).tolist()
    lonbuckets = np.linspace(0, 1, nbuckets).tolist()
    b_plat = fc.bucketized_column(feature_columns['pickup_latitude'],
                                  latbuckets)
    b_dlat = fc.bucketized_column(feature_columns['dropoff_latitude'],
                                  latbuckets)
    b_plon = fc.bucketized_column(feature_columns['pickup_longitude'],
                                  lonbuckets)
    b_dlon = fc.bucketized_column(feature_columns['dropoff_longitude'],
                                  lonbuckets)
    ploc = fc.crossed_column([b_plat, b_plon], nbuckets * nbuckets)
    dloc = fc.crossed_column([b_dlat, b_dlon], nbuckets * nbuckets)
    pd_pair = fc.crossed_column([ploc, dloc], nbuckets**4)
    feature_columns['pickup_and_dropoff'] = fc.embedding_column(pd_pair, 100)

    return transformed, feature_columns
Example #10
def create_feature_columns():
  # user feature
  phoneBrandId = fc.categorical_column_with_hash_bucket("phoneBrand", 1000)
  phoneResolutionId = fc.categorical_column_with_hash_bucket("phoneResolution", 500)
  phoneBrand = fc.embedding_column(phoneBrandId, 20)
  phoneResolution = fc.embedding_column(phoneResolutionId, 10)
  phoneOs = fc.indicator_column(
    fc.categorical_column_with_vocabulary_list("phoneOs", ["android", "ios"], default_value=0))
  # context feature
  matchScore = fc.numeric_column("matchScore", default_value=0.0)
  popScore = fc.numeric_column("popScore", default_value=0.0)
  brandPrefer = fc.numeric_column("brandPrefer", default_value=0.0, normalizer_fn=truncate)
  cate2Prefer = fc.numeric_column("cate2Prefer", default_value=0.0, normalizer_fn=truncate)
  catePrefer = fc.numeric_column("catePrefer", default_value=0.0, normalizer_fn=truncate)
  sellerPrefer = fc.numeric_column("sellerPrefer", default_value=0.0, normalizer_fn=truncate)
  matchType = fc.indicator_column(fc.categorical_column_with_identity("matchType", 9, default_value=0))
  position = fc.indicator_column(fc.categorical_column_with_identity("position", 201, default_value=200))
  triggerNum = fc.indicator_column(fc.categorical_column_with_identity("triggerNum", 51, default_value=50))
  triggerRank = fc.indicator_column(fc.categorical_column_with_identity("triggerRank", 51, default_value=50))
  sceneType = fc.indicator_column(fc.categorical_column_with_identity("type", 2, default_value=0))
  hour = fc.indicator_column(fc.categorical_column_with_identity("hour", 24, default_value=0))

  global my_feature_columns
  my_feature_columns = [matchScore, matchType, position, triggerNum, triggerRank, sceneType, hour, phoneBrand,
                        phoneResolution, phoneOs, popScore, sellerPrefer, brandPrefer, cate2Prefer, catePrefer]
  print("feature columns:", my_feature_columns)
  return my_feature_columns
Example #11
def categorical_embedding_with_indices(feature_tensor, feature_info, file_io: FileIO):
    """
    Converts input integer tensor into categorical embedding.
    Works by converting the categorical indices in the input feature_tensor,
    represented as integer values, into categorical embeddings based on the feature_info.

    Parameters
    ----------
    feature_tensor : Tensor object
        int feature tensor
    feature_info : dict
        Dictionary representing the configuration parameters for the specific feature from the FeatureConfig
    file_io : FileIO object
        FileIO handler object for reading and writing

    Returns
    -------
    Tensor object
        categorical embedding for the input feature_tensor

    Notes
    -----
    Args under feature_layer_info:
        num_buckets : int
            Maximum number of categorical values
        default_value : int
            default value to be assigned to indices out of the num_buckets range
        embedding_size : int
            dimension size of the categorical embedding

    String based categorical features should already be converted into numeric indices
    """
    feature_layer_info = feature_info.get("feature_layer_info")

    categorical_fc = feature_column.categorical_column_with_identity(
        CATEGORICAL_VARIABLE,
        num_buckets=feature_layer_info["args"]["num_buckets"],
        default_value=feature_layer_info["args"].get("default_value", None),
    )
    embedding_fc = feature_column.embedding_column(
        categorical_fc, dimension=feature_layer_info["args"]["embedding_size"], trainable=True
    )

    embedding = layers.DenseFeatures(
        embedding_fc,
        name="{}_embedding".format(feature_info.get("node_name", feature_info["name"])),
    )({CATEGORICAL_VARIABLE: feature_tensor})
    embedding = tf.expand_dims(embedding, axis=1)

    return embedding
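A minimal sketch of the feature_info this helper expects, assuming only the keys mentioned in the docstring; the feature name, bucket count and embedding size are made up, and file_io is not used by the identity-column path.

# Hypothetical configuration and call, for illustration only.
feature_info = {
    "name": "category_id",
    "feature_layer_info": {
        "args": {"num_buckets": 100, "default_value": 0, "embedding_size": 16}
    },
}
feature_tensor = tf.constant([[3], [42]], dtype=tf.int64)
embedding = categorical_embedding_with_indices(feature_tensor, feature_info, file_io=None)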
Example #12
def categorical_column(key, vocabulary_size=None,
                       vocabulary_list=None,
                       vocabulary_file=None,
                       num_oov_buckets=0):
    if vocabulary_size:
        categorical_col = feature_column.categorical_column_with_identity(key, vocabulary_size)
        return categorical_col
    elif vocabulary_list:
        assert isinstance(vocabulary_list[0], six.string_types), "Vocabulary must be sequence of string"
        categorical_col = feature_column.categorical_column_with_vocabulary_list(key, vocabulary_list, num_oov_buckets)
        return categorical_col
    elif vocabulary_file:
        categorical_col = feature_column.categorical_column_with_vocabulary_file(key, vocabulary_file, num_oov_buckets)
        return categorical_col
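Illustrative calls covering the three dispatch paths (feature names, vocabulary and file path are assumptions); note that the function implicitly returns None when none of the three vocabulary arguments is supplied.

by_size = categorical_column("user_level", vocabulary_size=10)
by_list = categorical_column("device", vocabulary_list=["mobile", "desktop"], num_oov_buckets=1)
by_file = categorical_column("city", vocabulary_file="cities.txt", num_oov_buckets=2)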
Example #13
def test_categorical_identity_column():
    # 1. Input features
    price = {'price': [[3], [1], [2], [0]]}
    # 2. Feature columns (Sparse)
    identity_feature_column = feature_column.categorical_column_with_identity(
        key='price', num_buckets=4)
    # 2. Feature columns (Dense)
    # Convert the Categorical Column to Dense Column
    indicator_column = feature_column.indicator_column(identity_feature_column)
    # 3. Feature tensor
    identity_feature_tensor = feature_column.input_layer(
        price, [indicator_column])
    with tf.Session() as session:
        print(session.run([identity_feature_tensor]))
Example #14
def create_feature_columns():
    # user feature
    driver_age_class = fc.embedding_column(
        fc.categorical_column_with_identity("driver_age", num_buckets=7, default_value=0), 32)

    # item feature
    pax_age_class = fc.embedding_column(fc.categorical_column_with_identity("pax_age", num_buckets=7, default_value=0),
                                        32)

    pax_des = fc.categorical_column_with_hash_bucket("des_id", 10000)
    pax_des_embed = fc.embedding_column(pax_des, 32)

    # context feature
    pax_price = tf.feature_column.numeric_column('price_id', default_value=0.0)
    pax_price_splits = tf.feature_column.bucketized_column(
        pax_price, boundaries=[10 * 100, 20 * 100, 30 * 100, 40 * 100, 50 * 100, 60 * 100, 70 * 100, 80 * 100, 90 * 100,
                               100 * 100, 110 * 100, 120 * 100])
    pax_price_embed = fc.embedding_column(pax_price_splits, 32)

    seq_cols = ['hist_price_id', 'hist_des_id']
    # hist_price_seq_embed = fc.embedding_column(fc.categorical_column_with_vocabulary_file(
    #     key='hist_price_id',
    #     vocabulary_file='./map.txt',
    #     num_oov_buckets=0), 32)
    # hist_des_seq_embed = fc.embedding_column(
    #     fc.categorical_column_with_vocabulary_file(key='hist_des_id', vocabulary_file='./map.txt',
    #                                                default_value=0), dimension=32)

    hist_price_seq_embed = fc.numeric_column(key='hist_price_id', shape=(3,), default_value=[0.0] * 3, dtype=tf.float32)

    hist_des_seq_embed = fc.numeric_column(key='hist_des_id', shape=(3,), default_value=[0.0] * 3, dtype=tf.float32)

    global my_feature_columns
    my_feature_columns = [driver_age_class, pax_age_class, pax_des_embed, pax_price_embed, hist_price_seq_embed,
                          hist_des_seq_embed]
    return my_feature_columns
Example #15
    def create_features_columns(self):
        # Vector-valued features
        user_vector = fc.numeric_column(key="user_vector",
                                        shape=(128, ),
                                        default_value=[0.0] * 128,
                                        dtype=tf.float32)
        item_vector = fc.numeric_column(key="item_vector",
                                        shape=(128, ),
                                        default_value=[0.0] * 128,
                                        dtype=tf.float32)

        # Bucketized features
        age = fc.numeric_column(key="age",
                                shape=(1, ),
                                default_value=[0],
                                dtype=tf.int64)
        age = fc.bucketized_column(
            age, boundaries=[0, 10, 20, 30, 40, 50, 60, 70, 80])
        age = fc.embedding_column(age, dimension=32, combiner='mean')

        # Categorical features
        city = fc.categorical_column_with_identity(key="city",
                                                   num_buckets=1000,
                                                   default_value=0)
        city = fc.embedding_column(city, dimension=32, combiner='mean')

        # Hash-bucket features
        device_id = fc.categorical_column_with_hash_bucket(
            key="device_id", hash_bucket_size=1000000, dtype=tf.int64)
        device_id = fc.embedding_column(device_id,
                                        dimension=32,
                                        combiner='mean')

        item_id = fc.categorical_column_with_hash_bucket(
            key="item_id", hash_bucket_size=10000, dtype=tf.int64)
        item_id = fc.embedding_column(item_id, dimension=32, combiner='mean')

        self.user_columns["user_vector"] = user_vector
        self.user_columns["age"] = age
        self.user_columns["city"] = city
        self.user_columns["device_id"] = device_id
        self.item_columns["item_vector"] = item_vector
        self.item_columns["item_id"] = item_id

        self.feature_spec = tf.feature_column.make_parse_example_spec(
            list(self.user_columns.values()) + list(self.item_columns.values()))

        return self
Example #16
def create_feature_columns():
  # user feature
  bids = fc.categorical_column_with_hash_bucket("behaviorBids", 10240, dtype=tf.int64)
  c1ids = fc.categorical_column_with_hash_bucket("behaviorC1ids", 100, dtype=tf.int64)
  cids = fc.categorical_column_with_hash_bucket("behaviorCids", 10240, dtype=tf.int64)
  sids = fc.categorical_column_with_hash_bucket("behaviorSids", 10240, dtype=tf.int64)
  pids = fc.categorical_column_with_hash_bucket("behaviorPids", 1000000, dtype=tf.int64)
  bids_weighted = fc.weighted_categorical_column(bids, "bidWeights")
  c1ids_weighted = fc.weighted_categorical_column(c1ids, "c1idWeights")
  cids_weighted = fc.weighted_categorical_column(cids, "cidWeights")
  sids_weighted = fc.weighted_categorical_column(sids, "sidWeights")
  pids_weighted = fc.weighted_categorical_column(pids, "pidWeights")

  # item feature
  pid = fc.categorical_column_with_hash_bucket("productId", 1000000, dtype=tf.int64)
  sid = fc.categorical_column_with_hash_bucket("sellerId", 10240, dtype=tf.int64)
  bid = fc.categorical_column_with_hash_bucket("brandId", 10240, dtype=tf.int64)
  c1id = fc.categorical_column_with_hash_bucket("cate1Id", 100, dtype=tf.int64)
  cid = fc.categorical_column_with_hash_bucket("cateId", 10240, dtype=tf.int64)

  # context feature
  matchScore = fc.numeric_column("matchScore", default_value=0.0)
  popScore = fc.numeric_column("popScore", default_value=0.0)
  brandPrefer = fc.numeric_column("brandPrefer", default_value=0.0)
  cate2Prefer = fc.numeric_column("cate2Prefer", default_value=0.0)
  catePrefer = fc.numeric_column("catePrefer", default_value=0.0)
  sellerPrefer = fc.numeric_column("sellerPrefer", default_value=0.0)
  matchType = fc.indicator_column(fc.categorical_column_with_identity("matchType", 9, default_value=0))
  position = fc.indicator_column(fc.categorical_column_with_identity("position", 201, default_value=200))
  triggerNum = fc.indicator_column(fc.categorical_column_with_identity("triggerNum", 51, default_value=50))
  triggerRank = fc.indicator_column(fc.categorical_column_with_identity("triggerRank", 51, default_value=50))
  sceneType = fc.indicator_column(fc.categorical_column_with_identity("type", 2, default_value=0))
  hour = fc.indicator_column(fc.categorical_column_with_identity("hour", 24, default_value=0))
  phoneBrand = fc.indicator_column(fc.categorical_column_with_hash_bucket("phoneBrand", 1000))
  phoneResolution = fc.indicator_column(fc.categorical_column_with_hash_bucket("phoneResolution", 500))
  phoneOs = fc.indicator_column(
    fc.categorical_column_with_vocabulary_list("phoneOs", ["android", "ios"], default_value=0))
  tab = fc.indicator_column(fc.categorical_column_with_vocabulary_list("tab",
        ["ALL", "TongZhuang", "XieBao", "MuYing", "NvZhuang", "MeiZhuang", "JuJia", "MeiShi"], default_value=0))

  pid_embed = fc.shared_embedding_columns([pids_weighted, pid], 64, combiner='sum', shared_embedding_collection_name="pid")
  bid_embed = fc.shared_embedding_columns([bids_weighted, bid], 32, combiner='sum', shared_embedding_collection_name="bid")
  cid_embed = fc.shared_embedding_columns([cids_weighted, cid], 32, combiner='sum', shared_embedding_collection_name="cid")
  c1id_embed = fc.shared_embedding_columns([c1ids_weighted, c1id], 10, combiner='sum', shared_embedding_collection_name="c1id")
  sid_embed = fc.shared_embedding_columns([sids_weighted, sid], 32, combiner='sum', shared_embedding_collection_name="sid")
  global my_feature_columns
  my_feature_columns = [matchScore, matchType, position, triggerNum, triggerRank, sceneType, hour, phoneBrand, phoneResolution,
             phoneOs, tab, popScore, sellerPrefer, brandPrefer, cate2Prefer, catePrefer]
  my_feature_columns += pid_embed
  my_feature_columns += sid_embed
  my_feature_columns += bid_embed
  my_feature_columns += cid_embed
  my_feature_columns += c1id_embed
  print("feature columns:", my_feature_columns)
  return my_feature_columns
Example #17
    def transform(self, output_tensors):
        input_tensor_name = self.parameters.get("input_tensor")
        output_tensor_name = self.parameters.get("output_tensor")
        # default_value must be None or an int in [0, num_buckets) for an identity column
        default_value = self.parameters.get("default_value", None)
        if "bucket_size" in self.parameters:
            bucket_size = self.parameters.get("bucket_size")
        else:
            msg = "parameter error: sparse_column_with_integerized_feature requires bucket_size"
            logger.error(msg)
            raise ParametersError(msg)

        output_tensors[output_tensor_name] = fc.categorical_column_with_identity(
            key=input_tensor_name,
            num_buckets=bucket_size,
            default_value=default_value
        )
Example #18
def create_feature_columns():
    # When the number of distinct categories is unknown, hash the values into a fixed number of buckets
    Brand = fc.categorical_column_with_hash_bucket("Brand", 1000)
    # Discretization when the vocabulary is fixed and known
    phoneOs = fc.categorical_column_with_vocabulary_list("phoneOs",
                                                         ["android", "ios"],
                                                         default_value=0)
    # Handling a continuous variable
    brandPrefer = fc.numeric_column("brandPrefer",
                                    default_value=0.0,
                                    normalizer_fn=truncate)
    # One-hot encoding via an identity column
    matchType = fc.categorical_column_with_identity("matchType",
                                                    9,
                                                    default_value=0)
    # fc.indicator_column can be used to densify the categorical columns above, as sketched below
    return [Brand, phoneOs, brandPrefer, matchType]
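As noted in the last comment, the sparse columns can be densified with fc.indicator_column; a hedged sketch of that step (the DenseFeatures wiring is an assumption, not part of the original).

# Wrap the sparse columns in indicator_column and build a dense input layer.
Brand, phoneOs, brandPrefer, matchType = create_feature_columns()
dense_columns = [fc.indicator_column(Brand), fc.indicator_column(phoneOs),
                 brandPrefer, fc.indicator_column(matchType)]
dense_inputs = tf.keras.layers.DenseFeatures(dense_columns)  # ready to apply to a feature dict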
Example #19
def transform(inputs):
    transformed = inputs.copy()

    for feature_transform_info in FEATURE_TRANSFORM_INFO_EXECUTE_ARRAY:
        if feature_transform_info.op_name == TransformOp.HASH:
            transformed[feature_transform_info.output_name] = CategoryHash(
                feature_transform_info.param)(
                    transformed[feature_transform_info.input_name])
        elif feature_transform_info.op_name == TransformOp.BUCKETIZE:
            transformed[feature_transform_info.output_name] = NumericBucket(
                feature_transform_info.param)(
                    transformed[feature_transform_info.input_name])
        elif feature_transform_info.op_name == TransformOp.LOOKUP:
            transformed[feature_transform_info.output_name] = CategoryLookup(
                feature_transform_info.param)(
                    transformed[feature_transform_info.input_name])
        elif feature_transform_info.op_name == TransformOp.GROUP:
            group_inputs = [
                transformed[name] for name in feature_transform_info.input_name
            ]
            offsets = list(
                itertools.accumulate([0] + feature_transform_info.param[:-1]))
            transformed[feature_transform_info.output_name] = Group(offsets)(
                group_inputs)
        elif feature_transform_info.op_name == TransformOp.EMBEDDING:
            # The num_buckets should be calculated from the group items
            group_identity = fc.categorical_column_with_identity(
                feature_transform_info.input_name,
                num_buckets=feature_transform_info.param[0],
            )
            group_embedding = fc.embedding_column(
                group_identity, dimension=feature_transform_info.param[1])
            transformed[feature_transform_info.
                        output_name] = tf.keras.layers.DenseFeatures(
                            [group_embedding])({
                                feature_transform_info.input_name:
                                transformed[feature_transform_info.input_name]
                            })
        elif feature_transform_info.op_name == TransformOp.ARRAY:
            transformed[feature_transform_info.output_name] = [
                transformed[name] for name in feature_transform_info.input_name
            ]

    return tuple([transformed[name] for name in TRANSFORM_OUTPUTS])
Example #20
def create_linear_feature_columns():
  phoneBrand = fc.categorical_column_with_hash_bucket("phoneBrand", 1000)
  phoneResolution = fc.categorical_column_with_hash_bucket("phoneResolution", 500)
  phoneOs = fc.categorical_column_with_vocabulary_list("phoneOs", ["android", "ios"], default_value=0)
  matchScore = fc.numeric_column("matchScore", default_value=0.0)
  popScore = fc.numeric_column("popScore", default_value=0.0)
  brandPrefer = fc.numeric_column("brandPrefer", default_value=0.0, normalizer_fn=truncate)
  cate2Prefer = fc.numeric_column("cate2Prefer", default_value=0.0, normalizer_fn=truncate)
  catePrefer = fc.numeric_column("catePrefer", default_value=0.0, normalizer_fn=truncate)
  sellerPrefer = fc.numeric_column("sellerPrefer", default_value=0.0, normalizer_fn=truncate)
  matchType = fc.categorical_column_with_identity("matchType", 9, default_value=0)
  position = fc.categorical_column_with_identity("position", 201, default_value=200)
  triggerNum = fc.categorical_column_with_identity("triggerNum", 51, default_value=50)
  triggerRank = fc.categorical_column_with_identity("triggerRank", 51, default_value=50)
  sceneType = fc.categorical_column_with_identity("type", 2, default_value=0)
  hour = fc.categorical_column_with_identity("hour", 24, default_value=0)
  columns = [phoneBrand, phoneResolution, phoneOs, matchScore, popScore, brandPrefer, cate2Prefer, catePrefer,
          sellerPrefer, matchType, position, triggerRank, triggerNum, sceneType, hour]
  print("linear feature columns:", columns)
  return columns
Example #21
# numeric cols
for header in ['age']:
    feature_columns.append(feature_column.numeric_column(header))

age = feature_column.numeric_column("age")

# bucketized cols
age_buckets = feature_column.bucketized_column(
    age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
#feature_columns.append(age_buckets)

#for header in ['KreditFF','Insolvent','Reglerad','man']:
#categorical_column =feature_column.categorical_column_with_identity(key=header, num_buckets=100, default_value=0)
#categorical_columns.append(feature_column.categorical_column_with_identity(key=header, num_buckets=100, default_value=0))

categorical_column_1 = feature_column.categorical_column_with_identity(
    key='KreditFF360', num_buckets=100, default_value=0)
categorical_column_2 = feature_column.categorical_column_with_identity(
    key='A_Insolvent360', num_buckets=100, default_value=0)
categorical_column_3 = feature_column.categorical_column_with_identity(
    key='A_Utslag360', num_buckets=100, default_value=0)
categorical_column_4 = feature_column.categorical_column_with_identity(
    key='man', num_buckets=2, default_value=0)

feature_columns = [
    tf.feature_column.indicator_column(categorical_column_1),
    tf.feature_column.indicator_column(categorical_column_2),
    tf.feature_column.indicator_column(categorical_column_3)
    #tf.feature_column.indicator_column(categorical_column_m)
]

#feature_columns.append(age_buckets)
Example #22
"""--------------------------------------------- bucketized_column -------------------------------------------------"""
"""bucketized_column"""
with tf.Session() as session:
    price = {'price': [[5.], [15.], [25.], [35.]]}
    price_column = feature_column.numeric_column('price')
    bucket_price = feature_column.bucketized_column(price_column,
                                                    [10, 20, 30, 40])
    price_bucket_tensor = feature_column.input_layer(price, [bucket_price])
    print(session.run([price_bucket_tensor]))
"""--------------------------------------------- Categorical_column -------------------------------------------------"""
"""categorical_column_with_identity"""
with tf.Session() as sess:
    color_data = {'color': [[2], [5], [-1], [0]]}
    # -1 is outside [0, 7); without a default_value the lookup would fail at runtime
    color_column = feature_column.categorical_column_with_identity('color', 7, default_value=0)
    color_column_identy = feature_column.indicator_column(color_column)
    color_dense_tensor = feature_column.input_layer(color_data,
                                                    [color_column_identy])
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    print(sess.run([color_dense_tensor]))
"""categorical_column_with_vocabulary_list"""
with tf.Session() as sess:
    color_data = {'color': [['R', 'R'], ['G', 'R'], ['B', 'G'], ['A', 'A']]}
    color_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1)
    color_column_identy = feature_column.indicator_column(color_column)
    color_dense_tensor = feature_column.input_layer(color_data,
                                                    [color_column_identy])
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    print(sess.run([color_dense_tensor]))
Example #23
def main(args):
    in_csv_path = Path(args.in_csv)
    y_name = args.y_var
    utils.setupLogFile(in_csv_path.parent)

    logging.info(' --- RUN for outvar {}, target {} ----- '.format(
        args.outvar, y_name))
    if not in_csv_path.exists():
        logging.error('Could not find the input file')

    try:
        dataframe = read_csv(str(in_csv_path))
        for column_name in dataframe.columns:
            if column_name.startswith('_'):
                dataframe.pop(column_name)

        for header in [
                'fl_1', 'bp_1', 'hc_1', 'ac_1', 'mom_age_edd', 'mom_weight_lb',
                'mom_height_in'
        ]:
            r = max(dataframe[header]) - min(dataframe[header])
            dataframe[header] = (dataframe[header] -
                                 min(dataframe[header])) / r

        train_whole = dataframe[(dataframe[y_name] != '.')
                                & (notna(dataframe[y_name])) &
                                (notnull(dataframe[y_name]))].copy()
        train_whole = train_whole.astype({y_name: 'int32'})

        logging.info(
            ' Number of training samples in the selected set: {}'.format(
                len(train_whole)))

        batch_size = 32  # A small batch size is used for demonstration purposes
        feature_columns = []
        feature_names = []
        for header in [
                'fl_1', 'bp_1', 'hc_1', 'ac_1', 'mom_age_edd', 'mom_weight_lb',
                'mom_height_in'
        ]:
            feature_columns.append(feature_column.numeric_column(header))
            feature_names.append(header)

        for header in [
                'hiv', 'current_smoker', 'former_smoker', 'chronic_htn',
                'preg_induced_htn', 'diabetes', 'gest_diabetes'
        ]:
            col = feature_column.categorical_column_with_identity(header, 2)
            col = feature_column.indicator_column(col)
            feature_columns.append(col)
            feature_names.append(header)

        all_pred = [0] * len(dataframe)

        print('****** args.all_train is: {}'.format(args.all_train))
        if not args.all_train:
            trimester = train_whole['trimester'].values.tolist()
            min_trim = min(trimester)
            max_trim = max(trimester)

            model_trim = None
            model_2trim = None
            model_3trim = None

            if min_trim == max_trim:
                # This data is only for one of the trimesters, run the training for one of them.
                logging.info(
                    'Training only for Trimester regression: {}'.format(
                        min_trim + 2))
                if min_trim == 0:
                    model_2trim = train_trimester_2(train_whole,
                                                    feature_columns,
                                                    batch_size, y_name)
                    if model_2trim is None:
                        raise Exception('2nd trimester model empty')
                else:
                    model_3trim = train_trimester_3(train_whole,
                                                    feature_columns,
                                                    batch_size, y_name)
                    if model_3trim is None:
                        raise Exception('3rd trimester model empty')
            else:
                model_trim = train_trimester(train_whole, feature_columns,
                                             batch_size, 'trimester')
                trim_2_df = train_whole[train_whole['trimester'] == 0]
                model_2trim = train_trimester_2(trim_2_df, feature_columns,
                                                batch_size, y_name)
                trim_3_df = train_whole[train_whole['trimester'] == 1]
                model_3trim = train_trimester_3(trim_3_df, feature_columns,
                                                batch_size, y_name)
                logging.info('-- done training for all three ')
                if model_trim is None or model_2trim is None or model_3trim is None:
                    raise Exception(
                        'One of the models came back empty during the classification/regression phase'
                    )

            # Classify the dataset if this is a multi-trimester dataset
            if model_trim is not None and model_2trim is not None and model_3trim is not None:
                logging.info('Creating predictions for the full dataset')
                ds = df_to_dataset(dataframe,
                                   shuffle=False,
                                   batch_size=32,
                                   labels_name=y_name)
                ga_2trim = model_2trim.predict(ds)
                ga_3trim = model_3trim.predict(ds)

                ds = df_to_dataset(dataframe,
                                   shuffle=False,
                                   batch_size=32,
                                   labels_name='trimester')
                c_p = (model_trim.predict(ds) > 0).astype("int32")

                all_pred = [
                    g_2[0] if c == 0 else g_3[0]
                    for (g_2, g_3, c) in zip(ga_2trim, ga_3trim, c_p)
                ]
                logging.info('Length of all predictions list is: {}'.format(
                    len(all_pred)))

            elif min_trim == max_trim:
                ds = df_to_dataset(dataframe,
                                   shuffle=False,
                                   batch_size=32,
                                   labels_name=y_name)
                if min_trim == 0 and model_2trim is not None:
                    all_pred = model_2trim.predict(ds)
                elif min_trim == 1 and model_3trim is not None:
                    all_pred = model_3trim.predict(ds)
                else:
                    logging.error('Either 2nd or 3rd trimester data is null')
            else:
                logging.error('We are in unknown territory, exiting')

        else:  # Per trimester if/else
            model_g = train_general(train_whole, feature_columns, batch_size,
                                    y_name)
            ds = df_to_dataset(dataframe,
                               shuffle=False,
                               batch_size=32,
                               labels_name=y_name)
            all_pred = model_g.predict(ds)

        logging.info('Creating output dataset')
        out_df = dataframe[['PatientID', 'filename', 'studydate']].copy()
        out_df[args.outvar] = all_pred
        out_path = in_csv_path.parent / (args.outvar + '.csv')
        logging.info('Should output to: {}'.format(out_path))
        out_df.to_csv(out_path)
    except Exception as e:
        logging.error('Error: \n{}'.format(e))
        logging.error(e)
Example #24
def categorical_indicator_with_vocabulary_file(feature_tensor, feature_info,
                                               file_io: FileIO):
    """
    Converts a string tensor into a categorical one-hot representation.
    Works by using a vocabulary file to convert the string tensor into categorical indices
    and then converting the categories into one-hot representation.

    Args:
        feature_tensor: String feature tensor
        feature_info: Dictionary representing the configuration parameters for the specific feature from the FeatureConfig

    Returns:
        Categorical one-hot representation of input feature_tensor

    Args under feature_layer_info:
        vocabulary_file: string; path to vocabulary CSV file for the input tensor containing the vocabulary to look-up.
                        Uses the column named "key" as the vocabulary, or the first column if no "key" column is present.
        max_length: int; max number of rows to consider from the vocabulary file.
                        if null, considers the entire file vocabulary.
        num_oov_buckets: int - optional; number of out of vocabulary buckets/slots to be used to
                         encode strings into categorical indices. If not specified, the default is 1.

    NOTE:
    The vocabulary CSV file must contain two columns - key, id,
    where the key is mapped to one id thereby resulting in a
    many-to-one vocabulary mapping.
    If id field is absent, a unique whole number id is assigned by default
    resulting in a one-to-one mapping
    """
    #
    ##########################################################################
    #
    # NOTE:
    # Current bug[1] with saving a Keras model when using
    # feature_column.categorical_column_with_vocabulary_list.
    # Tracking the issue currently and should be able to upgrade
    # to current latest stable release 2.2.0 to test.
    #
    # Can not use TF2.1.0 due to issue[2] regarding saving Keras models with
    # custom loss, metric layers
    #
    # Can not use TF2.2.0 due to issues[3, 4] regarding incompatibility of
    # Keras Functional API models and Tensorflow
    #
    # References:
    # [1] https://github.com/tensorflow/tensorflow/issues/31686
    # [2] https://github.com/tensorflow/tensorflow/issues/36954
    # [3] https://github.com/tensorflow/probability/issues/519
    # [4] https://github.com/tensorflow/tensorflow/issues/35138
    #
    # CATEGORICAL_VARIABLE = "categorical_variable"
    # categorical_fc = feature_column.categorical_column_with_vocabulary_list(
    #     CATEGORICAL_VARIABLE,
    #     vocabulary_list=vocabulary_list,
    #     default_value=feature_layer_info["args"].get("default_value", -1),
    #     num_oov_buckets=feature_layer_info["args"].get("num_oov_buckets", 0),
    # )
    #
    # indicator_fc = feature_column.indicator_column(categorical_fc)
    #
    # categorical_one_hot = layers.DenseFeatures(
    #     indicator_fc,
    #     name="{}_one_hot".format(feature_info.get("node_name", feature_info["name"])),
    # )({CATEGORICAL_VARIABLE: feature_tensor})
    # categorical_one_hot = tf.expand_dims(categorical_one_hot, axis=1)
    #
    ##########################################################################
    #
    feature_tensor_indices, vocabulary_keys, num_oov_buckets = categorical_indices_from_vocabulary_file(
        feature_info, feature_tensor, file_io)

    vocabulary_size = len(set(vocabulary_keys))

    categorical_identity_fc = feature_column.categorical_column_with_identity(
        CATEGORICAL_VARIABLE, num_buckets=vocabulary_size + num_oov_buckets)
    indicator_fc = feature_column.indicator_column(categorical_identity_fc)

    categorical_one_hot = layers.DenseFeatures(
        indicator_fc,
        name="{}_one_hot".format(
            feature_info.get("node_name", feature_info["name"])),
    )({
        CATEGORICAL_VARIABLE: feature_tensor_indices
    })
    categorical_one_hot = tf.expand_dims(categorical_one_hot, axis=1)

    return categorical_one_hot
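For reference, the vocabulary CSV described in the docstring might look like this (illustrative keys and ids, showing the many-to-one mapping):

# key,id
# red,0
# crimson,0
# blue,1
# navy,1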
Example #25
# Bucketize customer tenure (the numeric source column name is assumed)
tenure_buckets = feature_column.bucketized_column(
    feature_column.numeric_column('Tenure'), boundaries=[0, 3, 5, 7, 9])
feature_columns.append(tenure_buckets)

# indicator cols
geography = feature_column.categorical_column_with_vocabulary_list(
    'Geography', ['France', 'Spain', 'Germany'])
geography_one_hot = feature_column.indicator_column(geography)
gender = feature_column.categorical_column_with_vocabulary_list(
    'Gender', ['Female', 'Male'])
gender_one_hot = feature_column.indicator_column(gender)

feature_columns.append(geography_one_hot)
feature_columns.append(gender_one_hot)

for header in ['HasCrCard', 'IsActiveMember']:
    col = feature_column.categorical_column_with_identity(key=header,
                                                          num_buckets=2)
    col_one_hot = feature_column.indicator_column(col)
    feature_columns.append(col_one_hot)

# embedding cols
# hashed feature cols
# crossed cols

# Step 7 : Create a feature layer
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

# Step 8 : Create, compile, and train the model
Example #26
def transform_from_code_gen(source_inputs):
    inputs = source_inputs.copy()

    education_hash_out = CategoryHash(education_hash.param)(
        inputs["education"])
    occupation_hash_out = CategoryHash(occupation_hash.param)(
        inputs["occupation"])
    native_country_hash_out = CategoryHash(native_country_hash.param)(
        inputs["native_country"])
    workclass_lookup_out = CategoryLookup(workclass_lookup.param)(
        inputs["workclass"])
    marital_status_lookup_out = CategoryLookup(marital_status_lookup.param)(
        inputs["marital_status"])
    relationship_lookup_out = CategoryLookup(relationship_lookup.param)(
        inputs["relationship"])
    race_lookup_out = CategoryLookup(race_lookup.param)(inputs["race"])
    sex_lookup_out = CategoryLookup(sex_lookup.param)(inputs["sex"])
    age_bucketize_out = NumericBucket(age_bucketize.param)(inputs["age"])
    capital_gain_bucketize_out = NumericBucket(capital_gain_bucketize.param)(
        inputs["capital_gain"])
    capital_loss_bucketize_out = NumericBucket(capital_loss_bucketize.param)(
        inputs["capital_loss"])
    hours_per_week_bucketize_out = NumericBucket(
        hours_per_week_bucketize.param)(inputs["hours_per_week"])

    group1_out = Group(group1.param)([
        workclass_lookup_out,
        hours_per_week_bucketize_out,
        capital_gain_bucketize_out,
        capital_loss_bucketize_out,
    ])
    group2_out = Group(group2.param)([
        education_hash_out,
        marital_status_lookup_out,
        relationship_lookup_out,
        occupation_hash_out,
    ])
    group3_out = Group(group3.param)([
        age_bucketize_out,
        sex_lookup_out,
        race_lookup_out,
        native_country_hash_out,
    ])

    group1_wide_embedding_column = fc.embedding_column(
        fc.categorical_column_with_identity(
            "group1", num_buckets=group1_embedding_wide.param[0]),
        dimension=group1_embedding_wide.param[1],
    )
    group1_embedding_wide_out = tf.keras.layers.DenseFeatures(
        [group1_wide_embedding_column])({
            "group1": group1_out
        })

    group2_wide_embedding_column = fc.embedding_column(
        fc.categorical_column_with_identity(
            "group2", num_buckets=group2_embedding_wide.param[0]),
        dimension=group2_embedding_wide.param[1],
    )
    group2_embedding_wide_out = tf.keras.layers.DenseFeatures(
        [group2_wide_embedding_column])({
            "group2": group2_out
        })

    group1_deep_embedding_column = fc.embedding_column(
        fc.categorical_column_with_identity(
            "group1", num_buckets=group1_embedding_deep.param[0]),
        dimension=group1_embedding_deep.param[1],
    )
    group1_embedding_deep_out = tf.keras.layers.DenseFeatures(
        [group1_deep_embedding_column])({
            "group1": group1_out
        })

    group2_deep_embedding_column = fc.embedding_column(
        fc.categorical_column_with_identity(
            "group2", num_buckets=group2_embedding_deep.param[0]),
        dimension=group2_embedding_deep.param[1],
    )
    group2_embedding_deep_out = tf.keras.layers.DenseFeatures(
        [group2_deep_embedding_column])({
            "group2": group2_out
        })

    group3_deep_embedding_column = fc.embedding_column(
        fc.categorical_column_with_identity(
            "group3", num_buckets=group3_embedding_deep.param[0]),
        dimension=group3_embedding_deep.param[1],
    )
    group3_embedding_deep_out = tf.keras.layers.DenseFeatures(
        [group3_deep_embedding_column])({
            "group3": group3_out
        })

    wide_embeddings_out = [
        group1_embedding_wide_out,
        group2_embedding_wide_out,
    ]
    deep_embeddings_out = [
        group1_embedding_deep_out,
        group2_embedding_deep_out,
        group3_embedding_deep_out,
    ]

    return wide_embeddings_out, deep_embeddings_out
Example #27
def create_feature_columns():
    # user feature
    bids = fc.categorical_column_with_hash_bucket("behaviorBids",
                                                  10000,
                                                  dtype=tf.int64)
    c1ids = fc.categorical_column_with_hash_bucket("behaviorC1ids",
                                                   100,
                                                   dtype=tf.int64)
    cids = fc.categorical_column_with_hash_bucket("behaviorCids",
                                                  10000,
                                                  dtype=tf.int64)
    sids = fc.categorical_column_with_hash_bucket("behaviorSids",
                                                  10000,
                                                  dtype=tf.int64)
    pids = fc.categorical_column_with_hash_bucket("behaviorPids",
                                                  500000,
                                                  dtype=tf.int64)
    bids_weighted = fc.weighted_categorical_column(bids, "bidWeights")
    c1ids_weighted = fc.weighted_categorical_column(c1ids, "c1idWeights")
    cids_weighted = fc.weighted_categorical_column(cids, "cidWeights")
    sids_weighted = fc.weighted_categorical_column(sids, "sidWeights")
    pids_weighted = fc.weighted_categorical_column(pids, "pidWeights")
    pid_embed = fc.embedding_column(pids_weighted, 64)
    bid_embed = fc.embedding_column(bids_weighted, 32)
    cid_embed = fc.embedding_column(cids_weighted, 48)
    c1id_embed = fc.embedding_column(c1ids_weighted, 10)
    sid_embed = fc.embedding_column(sids_weighted, 32)
    phoneBrandId = fc.categorical_column_with_hash_bucket("phoneBrand", 1000)
    phoneBrand = fc.embedding_column(phoneBrandId, 20)
    phoneResolutionId = fc.categorical_column_with_hash_bucket(
        "phoneResolution", 500)
    phoneResolution = fc.embedding_column(phoneResolutionId, 10)
    phoneOs = fc.indicator_column(
        fc.categorical_column_with_vocabulary_list("phoneOs",
                                                   ["android", "ios"],
                                                   default_value=0))
    gender = fc.indicator_column(
        fc.categorical_column_with_identity("gender",
                                            num_buckets=3,
                                            default_value=0))
    age_class = fc.indicator_column(
        fc.categorical_column_with_identity("age_class",
                                            num_buckets=7,
                                            default_value=0))
    has_baby = fc.indicator_column(
        fc.categorical_column_with_identity("has_baby",
                                            num_buckets=2,
                                            default_value=0))
    baby_gender = fc.indicator_column(
        fc.categorical_column_with_identity("baby_gender",
                                            num_buckets=3,
                                            default_value=0))
    baby_age = fc.indicator_column(
        fc.categorical_column_with_identity("baby_age",
                                            num_buckets=7,
                                            default_value=0))
    grade = fc.indicator_column(
        fc.categorical_column_with_identity("grade",
                                            num_buckets=7,
                                            default_value=0))
    rfm_type = fc.indicator_column(
        fc.categorical_column_with_identity("bi_rfm_type",
                                            num_buckets=12,
                                            default_value=0))
    city_id = fc.categorical_column_with_hash_bucket("city", 700)
    city = fc.embedding_column(city_id, 16)
    userType = fc.indicator_column(
        fc.categorical_column_with_identity("user_type", 6, default_value=0))
    hour = fc.indicator_column(
        fc.categorical_column_with_identity("hour", 24, default_value=0))

    global my_feature_columns
    my_feature_columns = [
        userType, hour, gender, age_class, has_baby, baby_gender, baby_age,
        grade, rfm_type, phoneBrand, phoneResolution, phoneOs, pid_embed,
        sid_embed, bid_embed, cid_embed, c1id_embed, city
    ]
    print("feature columns:", my_feature_columns)
    return my_feature_columns
Example #28
def run_exp(params):
    exp.tag(params)
    URL = 'mushroom/all.csv'
    dataframe = pd.read_csv(URL)
    dataframe.head()

    specs, target = df_column_specs(dataframe, params=None)

    train, test = train_test_split(dataframe, test_size=0.2)
    train, val = train_test_split(train, test_size=0.2)
    print(len(train), 'train examples')
    print(len(val), 'validation examples')
    print(len(test), 'test examples')

    batch_size = params.batch or 32

    def df_to_dataset(dataframe, shuffle=True, batch_size=batch_size):
        dataframe = dataframe.copy()
        labels = dataframe.pop('target')
        ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
        if shuffle:
            ds = ds.shuffle(buffer_size=len(dataframe))
        ds = ds.batch(batch_size)
        return ds

    feature_columns = []
    # for mushroom we know the cols are all categorical, so we're not being too
    # careful here
    for col in specs:
        feature_columns.append(
            feature_column.indicator_column(
                feature_column.categorical_column_with_identity(
                    col['name'], col['card'])))
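    # Each spec becomes a one-hot (indicator) column with col['card'] distinct
    # integer ids; the DenseFeatures layer below concatenates these one-hot
    # vectors into a single dense input tensor.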

    feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

    train_ds = df_to_dataset(train, batch_size=batch_size)
    val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
    test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

    # ## Create Model
    #

    # In[20]:

    featurizer = tf.keras.Sequential([feature_layer], name='featurizer')

    l1 = params.l1 or 0.0
    inner_model_logits = tf.keras.Sequential([
        layers.Dense(1,
                     kernel_regularizer=tf.keras.regularizers.l1(l=l1),
                     kernel_initializer=tf.random_normal_initializer(),
                     bias_initializer=tf.random_normal_initializer())
    ],
                                             name='inner_model_logits')

    logit_prob = tf.keras.Sequential([layers.Activation('sigmoid')],
                                     name='logit_prob')

    features_prob = tf.keras.Sequential([inner_model_logits, logit_prob])

    # try out the model
    example_batch, label_batch = next(iter(train_ds))
    features = featurizer(example_batch)
    logits = inner_model_logits(features)
    logit_prob(logits)[:4]
    features_prob(features[:4])

    #  predictions = model(example_batch)[:,0]

    loss_object = tf.keras.losses.BinaryCrossentropy()

    if params.adam:
        optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
    else:
        optimizer = tf.keras.optimizers.Ftrl(learning_rate=0.01)

    # features_prob.compile(optimizer=optimizer, loss=loss_object)

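    # The PGD attack below expects a model that outputs one logit per class,
    # so the single sigmoid logit z is expanded to two-class logits [-z, z];
    # this preserves the decision boundary (class 1 iff z > 0) for the attack.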
    def inner_model_logits_2_class(x):
        logits = inner_model_logits(x)
        return tf.concat([-logits, logits], axis=1)

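    # Presumably cleverhans' projected_gradient_descent; the positional
    # arguments then read as eps=0.4, eps_iter=0.2, nb_iter=5, norm=np.inf
    # (an untargeted L-inf attack on the two-class logit wrapper above).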
    features_perturbed = projected_gradient_descent(inner_model_logits_2_class,
                                                    features,
                                                    0.4,
                                                    0.2,
                                                    5,
                                                    np.inf,
                                                    y=label_batch,
                                                    targeted=False)

    input_dim = features_prob.layers[0].inputs[0].shape[1]
    inputs = tf.keras.Input(shape=(input_dim, ))
    outputs = tf.keras.layers.Dense(
        1,
        activation='sigmoid',
        kernel_regularizer=tf.keras.regularizers.l1(l=l1),
        kernel_initializer=tf.random_normal_initializer(),
        bias_initializer=tf.random_normal_initializer())(inputs)
    features_prob_functional = tf.keras.Model(inputs=inputs, outputs=outputs)

    model = tf.keras.Sequential([
        featurizer, features_prob_functional
        # inner_model_logits,
        # logit_prob
    ])
    model(example_batch)[:4]

    def inner_model_probs_2_class(x):
        probs = features_prob_functional(x)
        return tf.concat([1 - probs, probs], axis=1)

    def mdl_to_2_class(mdl):
        def fn(x):
            probs = mdl(x)
            return tf.concat([1 - probs, probs], axis=1)

        return fn

    # test out adv perturbation, and closed form

    # features_perturbed_closed_form = adversarial.logistic_perturb(
    #   inner_model_logits, features, label_batch, 0.4)
    #
    # [features_perturbed_closed_form[0, :5], features_perturbed[0, :5]]

    train_loss = tf.keras.metrics.Mean(name='train_loss')
    train_auc = tf.keras.metrics.AUC(name='train_auc')
    train_acc = tf.keras.metrics.BinaryAccuracy(name='train_acc')

    test_loss = tf.keras.metrics.Mean(name='test_loss')
    test_auc = tf.keras.metrics.AUC(name='test_auc')
    test_acc = tf.keras.metrics.BinaryAccuracy(name='test_acc')

    # In[26]:

    #@tf.function
    def train_step_perturb_input(examples, labels, eps, eps_step,
                                 num_pgd_steps):
        features = featurizer(examples)
        features_prob_functional(features)  # dummy call to force weights to be
        # created
        # this is done outside the tape, but it's ok because we've ensured
        # there are no trainable vars in the featurizer (no embeddings!)
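        # adversarial.logistic_perturb appears to be a project-local helper
        # computing the closed-form worst-case perturbation for a logistic
        # model (see the commented-out "closed form" block above); note that
        # eps_step and num_pgd_steps are not used by this closed-form variant.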
        features_perturbed = adversarial.logistic_perturb(
            features_prob_functional, features, labels, eps)
        with tf.GradientTape() as tape:
            predictions = features_prob_functional(features_perturbed)
            loss = loss_object(labels, predictions[:, 0])
            loss += sum(features_prob_functional.losses)
        vars = featurizer.trainable_variables + \
               features_prob_functional.trainable_variables
        gradients = tape.gradient(loss, vars)
        optimizer.apply_gradients(zip(gradients, vars))
        train_loss(loss)
        train_auc(labels, tf.reshape(predictions, [-1]))
        train_acc(labels, tf.reshape(predictions, [-1]))

    def train_step_perturb_last_layer(examples,
                                      labels,
                                      eps,
                                      freeze_initial_layers=True):
        with tf.GradientTape() as tape:
            features = featurizer(examples)
            left, right = model_metrics.sub_models(inner_model_logits)
            penultimate_activations = left(features)

            right(penultimate_activations
                  )  # dummy call to force weights to be created
            d = penultimate_activations.shape[1]
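            # Dividing eps by sqrt(d) presumably keeps the perturbation budget
            # comparable across penultimate layers of different width d.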
            activations_perturbed = adversarial.logistic_perturb(
                right, penultimate_activations, labels, eps / math.sqrt(d))
            logits_perturbed = right(activations_perturbed)
            predictions = logit_prob(logits_perturbed)
            loss = loss_object(labels, predictions[:, 0])
            loss += sum(features_prob.losses)
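        # Note (assumption): the freeze_initial_layers branch indexes
        # inner_model_logits.layers[1], which presupposes a model with more
        # than one layer; the single-Dense model built above has only one.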
        if freeze_initial_layers:
            mdl = inner_model_logits.layers[1]
        else:
            mdl = inner_model_logits
        vars = mdl.trainable_variables  # + featurizer.trainable_variables
        gradients = tape.gradient(loss, vars)
        optimizer.apply_gradients(zip(gradients, vars))

        train_loss(loss)
        train_auc(labels, tf.reshape(predictions, [-1]))

    # In[27]:

    #@tf.function
    def test_step(examples, labels):
        predictions = model(examples)[:, 0]
        t_loss = loss_object(labels, predictions)
        test_loss(t_loss)
        test_auc(labels, predictions)
        test_acc(labels, predictions)


    # In[28]:

    # example_batch, label_batch = next(iter(train_ds))
    # predictions = model(example_batch)[:,0]
    # loss_object(label_batch, predictions)

    # In[29]:

    def get_all_x_y(ds):
        batches = []
        y = []
        for examples, labels in ds:
            x = featurizer(examples)
            batches = batches + [x]
            y = y + [labels]
        return tf.concat(batches, axis=0), tf.concat(y, axis=0)

    def reset_states():
        train_loss.reset_states()
        train_auc.reset_states()
        train_acc.reset_states()
        test_loss.reset_states()
        test_auc.reset_states()
        test_acc.reset_states()

    EPOCHS = params.epochs or 20
    if args.quick:
        EPOCHS = 2
    eps = params.eps or 0.0  # 1.3 ?
    num_pgd_steps = params.pgd_steps or 5
    perturb_input = params.perturb_input
    clean_epochs = params.clean_epochs or 2
    if args.quick:
        clean_epochs = 1
    eps_step_factor = params.eps_step_factor or 5.0
    for epoch in range(EPOCHS):
        reset_states()
        if epoch < clean_epochs:
            eps_val = 0
        else:
            eps_val = eps
        eps_step = eps_val / eps_step_factor

        for images, labels in train_ds:
            if perturb_input:
                train_step_perturb_input(images, labels, eps_val, eps_step,
                                         num_pgd_steps)
            else:
                # perturb penultimate layer, using closed form for binary prob output
                train_step_perturb_last_layer(images,
                                              labels,
                                              eps_val,
                                              freeze_initial_layers=True)

        for test_images, test_labels in test_ds:
            test_step(test_images, test_labels)

        template = 'Eps={}, Epoch {}, Loss: {}, AUC: {}, ' \
                   'Test Loss: {}, Test AUC: {}, Test Acc: {}'
        if epoch % 5 == 0:
            print(
                template.format(eps_val, epoch + 1, train_loss.result(),
                                train_auc.result() * 100, test_loss.result(),
                                test_auc.result() * 100,
                                test_acc.result() * 100))

    # dummy examples to force model shapes to be set
    example_batch, label_batch = next(iter(train_ds))
    model(example_batch)

    model_file = 'mushroom_models/' + f'eps={eps}'

    #model.save(model_file, overwrite=True)
    model.save_weights(model_file + ".ckpt", overwrite=True)
    m = model_metrics.weight_metrics(model)
    print(f'weight metrics with eps={eps}:')
    print(m)

    x_test, y_test = get_all_x_y(test_ds)
    attribution_method = 'shap' if params.attribution_shap else 'ig'
    mdl_shap = features_prob_functional
    mdl_ig = inner_model_probs_2_class
    av_ig_pct1pct, ig_ent, av_gini, ig_results = model_metrics.attribs_pct(
        mdl_shap, mdl_ig, x_test, y_test,
        attribution_method=attribution_method)
    print(f'eps={eps}: IG_pct1pct={av_ig_pct1pct}')

    print(f'****done with eps={eps} ****')
    print(f'****************************')

    params_and_results = params.mod(
        dict(test_auc=np.round(test_auc.result() * 100, 2),
             test_acc=np.round(test_acc.result() * 100, 2),
             ig_ent=np.round(ig_ent, 2),
             gini=np.round(av_gini, 3),
             ig_1p=np.round(av_ig_pct1pct[0], 2)))
    print('*** logging ****')
    print(params_and_results.dict())
    exp.log(params_and_results.dict())
    exp.save()
    exp.close()
    return ig_results
Exemplo n.º 29
0
def main():
    training_set = tf.contrib.learn.datasets.base.load_csv_without_header(
        filename='trainingData.txt',
        target_dtype=np.int,
        features_dtype=np.int,
        target_column=0)

    test_set = tf.contrib.learn.datasets.base.load_csv_without_header(
        filename='testData.txt',
        target_dtype=np.int,
        features_dtype=np.int,
        target_column=0)

    def input_fn_train():  # method used to deliver the training data
        x = {
            "blinkyE": tf.convert_to_tensor(training_set.data),
            "inkyE": tf.convert_to_tensor(training_set.data),
            "pinkyE": tf.convert_to_tensor(training_set.data),
            "sueE": tf.convert_to_tensor(training_set.data),
            "blinkyDist": tf.convert_to_tensor(training_set.data),
            "inkyDist": tf.convert_to_tensor(training_set.data),
            "pinkyDist": tf.convert_to_tensor(training_set.data),
            "sueDist": tf.convert_to_tensor(training_set.data)
        }
        y = tf.convert_to_tensor(training_set.target)
        return x, y

    def input_fn_test():
        x = {
            "blinkyE": tf.convert_to_tensor(test_set.data),
            "inkyE": tf.convert_to_tensor(test_set.data),
            "pinkyE": tf.convert_to_tensor(test_set.data),
            "sueE": tf.convert_to_tensor(test_set.data),
            "blinkyDist": tf.convert_to_tensor(test_set.data),
            "inkyDist": tf.convert_to_tensor(test_set.data),
            "pinkyDist": tf.convert_to_tensor(test_set.data),
            "sueDist": tf.convert_to_tensor(test_set.data)
        }
        y = tf.convert_to_tensor(test_set.target)
        return x, y

    # Describe the feature columns
    a1 = fc.embedding_column(fc.categorical_column_with_identity(key="blinkyE", num_buckets=2, default_value=0), dimension=1)
    b1 = fc.embedding_column(fc.categorical_column_with_identity(key="inkyE", num_buckets=2, default_value=0), 1)
    c1 = fc.embedding_column(fc.categorical_column_with_identity(key="pinkyE", num_buckets=2, default_value=0), 1)
    d1 = fc.embedding_column(fc.categorical_column_with_identity(key="sueE", num_buckets=2, default_value=0), 1)
    e1 = fc.embedding_column(fc.categorical_column_with_identity(key="blinkyDist", num_buckets=5, default_value=0), 1)
    f1 = fc.embedding_column(fc.categorical_column_with_identity(key="inkyDist", num_buckets=5, default_value=0), 1)
    g1 = fc.embedding_column(fc.categorical_column_with_identity(key="pinkyDist", num_buckets=5, default_value=0), 1)
    h1 = fc.embedding_column(fc.categorical_column_with_identity(key="sueDist", num_buckets=5, default_value=0), 1)
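    # Note: dimension-1 embeddings are used even for these tiny vocabularies;
    # indicator (one-hot) columns would be the more common choice here, but
    # the embeddings keep the input width fixed at one unit per feature.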

    categorical_columns = set([a1, b1, c1, d1, e1, f1, g1, h1])
    estimator = tf.estimator.DNNClassifier(feature_columns=categorical_columns,
                                           hidden_units=[24, 50, 24],
                                           n_classes=6,
                                           model_dir="/tmp/a8_model")

    #Fit model
    print("Train: ")
    estimator.train(input_fn=input_fn_train, max_steps=20000)
    print("Fitted! : ")
    print("Evaluate using a test set: ")
    evaluate = estimator.evaluate(input_fn=input_fn_test, steps=1)
    print(evaluate)

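    # Note: this manual Session and variable initialization is not required;
    # tf.estimator manages its own graph and session internally.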
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # save the model
    feature_spec = tf.feature_column.make_parse_example_spec(categorical_columns)
    tfrecord_serving_input_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(
        feature_spec)
    path = estimator.export_savedmodel(export_dir_base="/tmp/a8_model/saved",
                                       serving_input_receiver_fn=tfrecord_serving_input_fn,
                                       as_text=True)
    print(path)
Exemplo n.º 30
0
def _get_category_column_from_dict(name, kwargs):
    """生成对应的category feature column.

    :param name: 列名
    :param kwargs:
        下面四个参数互斥:
        * num_buckets: int, 最大数值
        * hash_buckets: int, hash桶数
        * vocab_list: list, 候选词列表
        * vocab_file: str, 候选词文件
    :return: feature_column
    """
    convert_methods = {
        "num_buckets", "hash_buckets", "vocab_list", "vocab_file"
    }
    convert_method = set(kwargs.keys()).intersection(convert_methods)

    # The four parameters are mutually exclusive
    if len(convert_method) > 1:
        raise ValueError("{}: {} cannot coexist. "
                         "Please keep only one.".format(name, convert_method))
    elif len(convert_method) < 1:
        raise ValueError("{}: argument missing, "
                         "one of {} should be given.".format(
                             name, convert_methods))
    else:
        convert_method = list(convert_method)[0]

    if convert_method == "num_buckets":
        # TODO: check that dtype == int
        num_buckets = kwargs.pop(convert_method)
        assert num_buckets > 0
        # TODO: it would be better to use -1 here if possible
        # missing values are filled with the maximum value
        default_value = num_buckets
        num_buckets += 1

        return fc.categorical_column_with_identity(name, num_buckets,
                                                   default_value)
    elif convert_method == "hash_buckets":
        hash_bucket_size = kwargs.pop(convert_method)
        assert hash_bucket_size > 0
        # TODO: support passing in a dtype
        return fc.categorical_column_with_hash_bucket(name, hash_bucket_size)
    elif convert_method == "vocab_list":
        vocabulary_list = kwargs.pop(convert_method)
        if isinstance(vocabulary_list, str):
            # TODO: support different separators for the vocabulary list
            vocabulary_list = [x.strip() for x in vocabulary_list.split(",")]

        assert len(vocabulary_list) > 1, ("{}: vocabulary list must have "
                                          "length > 1, but got: {}".format(
                                              name, len(vocabulary_list)))

        # TODO: support dtype and default_value
        return fc.categorical_column_with_vocabulary_list(
            name, vocabulary_list)
    elif convert_method == "vocab_file":
        # TODO: support vocabulary files
        raise NotImplementedError("vocab_file will be supported later.")
    else:
        raise ValueError("{}: {} is not supported".format(
            name, convert_method))
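
# A minimal usage sketch (assumption, not part of the original snippet): each
# kwargs dict selects exactly one conversion method, matching the docstring
# above; `fc` is tensorflow.feature_column as used throughout these examples,
# and the column names below are hypothetical.
id_col = _get_category_column_from_dict("user_level", {"num_buckets": 10})
hash_col = _get_category_column_from_dict("city", {"hash_buckets": 700})
vocab_col = _get_category_column_from_dict("os", {"vocab_list": "android,ios"})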