Example #1
File: model.py Project: sxy390/ML-training
def transform(inputs, NUMERIC_COLS, STRING_COLS, nbuckets):
    # Pass-through columns
    transformed = inputs.copy()
    del transformed['pickup_datetime']

    feature_columns = {
        colname: fc.numeric_column(colname)
        for colname in NUMERIC_COLS
    }

    # Scaling longitude from range [-78, -70] to [0, 1]
    for lon_col in ['pickup_longitude', 'dropoff_longitude']:
        transformed[lon_col] = Lambda(lambda x: (x + 78) / 8.0,
                                      name='scale_{}'.format(lon_col))(
                                          inputs[lon_col])

    # Scaling latitude from range [37, 45] to [0, 1]
    for lat_col in ['pickup_latitude', 'dropoff_latitude']:
        transformed[lat_col] = Lambda(lambda x: (x - 37) / 8.0,
                                      name='scale_{}'.format(lat_col))(
                                          inputs[lat_col])

    # Adding Euclidean dist (no need to be accurate: NN will calibrate it)
    transformed['euclidean'] = Lambda(euclidean, name='euclidean')([
        inputs['pickup_longitude'], inputs['pickup_latitude'],
        inputs['dropoff_longitude'], inputs['dropoff_latitude']
    ])
    feature_columns['euclidean'] = fc.numeric_column('euclidean')

    # hour of day from timestamp of form '2010-02-08 09:17:00+00:00'
    transformed['hourofday'] = Lambda(
        lambda x: tf.strings.to_number(tf.strings.substr(x, 11, 2),
                                       out_type=tf.dtypes.int32),
        name='hourofday')(inputs['pickup_datetime'])
    feature_columns['hourofday'] = fc.indicator_column(
        fc.categorical_column_with_identity('hourofday', num_buckets=24))

    latbuckets = np.linspace(0, 1, nbuckets).tolist()
    lonbuckets = np.linspace(0, 1, nbuckets).tolist()
    b_plat = fc.bucketized_column(feature_columns['pickup_latitude'],
                                  latbuckets)
    b_dlat = fc.bucketized_column(feature_columns['dropoff_latitude'],
                                  latbuckets)
    b_plon = fc.bucketized_column(feature_columns['pickup_longitude'],
                                  lonbuckets)
    b_dlon = fc.bucketized_column(feature_columns['dropoff_longitude'],
                                  lonbuckets)
    ploc = fc.crossed_column([b_plat, b_plon], nbuckets * nbuckets)
    dloc = fc.crossed_column([b_dlat, b_dlon], nbuckets * nbuckets)
    pd_pair = fc.crossed_column([ploc, dloc], nbuckets**4)
    feature_columns['pickup_and_dropoff'] = fc.embedding_column(pd_pair, 100)

    return transformed, feature_columns
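Example #1 calls a euclidean helper that is defined elsewhere in model.py. A minimal sketch consistent with how the Lambda layer invokes it (a single list of four tensors: pickup longitude/latitude and dropoff longitude/latitude) could look like the following; the actual implementation in the source project may differ.

import tensorflow as tf

def euclidean(params):
    # params is the 4-element list passed to the Lambda layer above:
    # [pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude]
    lon1, lat1, lon2, lat2 = params
    londiff = lon2 - lon1
    latdiff = lat2 - lat1
    return tf.sqrt(londiff * londiff + latdiff * latdiff)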
Example #2
def test_bucketized_column():
    sample = {
        'price': [[5.], [16], [25], [36]],
        'time': [[2.], [6], [8], [15]]
    }
    price_column = feature_column.numeric_column('price')
    bucket_price = feature_column.bucketized_column(price_column,
                                                    [10, 20, 30, 40])
    price_bucket_tensor = feature_column.input_layer(sample, [bucket_price])

    time_column = feature_column.numeric_column('time')
    bucket_time = feature_column.bucketized_column(time_column, [5, 10, 12])
    time_bucket_tensor = feature_column.input_layer(sample, [bucket_time])
    with tf.Session() as session:
        print(session.run([price_bucket_tensor, time_bucket_tensor]))
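Example #2 uses the TF 1.x input_layer/Session pattern. For reference, an equivalent sketch under TF 2.x eager execution would feed the same bucketized columns through tf.keras.layers.DenseFeatures instead; this rewrite is an assumption, not part of the original test.

import tensorflow as tf
from tensorflow import feature_column

sample = {'price': tf.constant([[5.], [16.], [25.], [36.]]),
          'time': tf.constant([[2.], [6.], [8.], [15.]])}
bucket_price = feature_column.bucketized_column(
    feature_column.numeric_column('price'), [10, 20, 30, 40])
bucket_time = feature_column.bucketized_column(
    feature_column.numeric_column('time'), [5, 10, 12])
# DenseFeatures one-hot encodes each bucket index: 5 output columns for
# 'price' (4 boundaries) and 4 for 'time' (3 boundaries).
feature_layer = tf.keras.layers.DenseFeatures([bucket_price, bucket_time])
print(feature_layer(sample))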
Example #3
def build_features(statistics):
    pu_location_id = fc.categorical_column_with_identity(key='PULocationID',
                                                         num_buckets=265)
    do_location_id = fc.categorical_column_with_identity(key='DOLocationID',
                                                         num_buckets=265)
    day_of_week = fc.categorical_column_with_identity(key='day_of_week',
                                                      num_buckets=7)
    weekend = fc.categorical_column_with_identity(key='weekend', num_buckets=2)
    speed_buckets = fc.bucketized_column(
        fc.numeric_column('speed'), boundaries=[10, 20, 30, 40, 50, 60, 70])
    distance_buckets = fc.bucketized_column(
        fc.numeric_column('trip_distance'),
        boundaries=[2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30])
    duration_buckets = fc.bucketized_column(
        fc.numeric_column('duration'),
        boundaries=[500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500])
    fare_buckets = fc.bucketized_column(
        fc.numeric_column('fare_amount'),
        boundaries=[2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30])
    passenger_buckets = fc.bucketized_column(
        fc.numeric_column('passenger_count'), boundaries=[1, 3, 5, 7, 9])
    location = fc.crossed_column([pu_location_id, do_location_id],
                                 hash_bucket_size=1000)
    cross_all = fc.crossed_column([
        location, speed_buckets, distance_buckets, duration_buckets,
        fare_buckets, passenger_buckets
    ],
                                  hash_bucket_size=1000)
    categorical_columns = [
        fc.embedding_column(pu_location_id, dimension=32),
        fc.embedding_column(do_location_id, dimension=32),
        fc.indicator_column(day_of_week),
        fc.indicator_column(weekend)
    ]
    numeric_columns = [
        custom_numeric_column('passenger_count', statistics),
        custom_numeric_column('trip_distance', statistics),
        custom_numeric_column('fare_amount', statistics),
        custom_numeric_column('extra', statistics),
        custom_numeric_column('mta_tax', statistics),
        custom_numeric_column('tolls_amount', statistics),
        custom_numeric_column('improvement_surcharge', statistics),
        custom_numeric_column('duration', statistics),
        custom_numeric_column('speed', statistics)
    ]
    dnn_feature_columns = numeric_columns + categorical_columns
    linear_feature_columns = [location, cross_all]
    return dnn_feature_columns, linear_feature_columns
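Example #3 relies on a custom_numeric_column(name, statistics) helper that is not shown. One plausible implementation, assuming statistics holds per-column means and standard deviations, is a z-score normalizer built on numeric_column's normalizer_fn; both the helper body and the statistics layout are assumptions.

from tensorflow import feature_column as fc

def custom_numeric_column(key, statistics):
    # Hypothetical helper: standardize the column with precomputed statistics,
    # e.g. statistics = {'fare_amount': {'mean': 12.5, 'stddev': 9.8}, ...}
    mean = statistics[key]['mean']
    stddev = statistics[key]['stddev']
    return fc.numeric_column(key, normalizer_fn=lambda x: (x - mean) / stddev)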
Example #4
def get_item_feature_columns(business_vocab_list, item_type_dict):

    items_feature_columns = []

    bucketized_boundary = {'stars': [2.5, 4]}
    embedding_size = {"categories": 8, "city": 4}

    for k, v in business_vocab_list.items():

        if k in ['review_count']:
            col = numeric_column(k, default_value=0, dtype=item_type_dict[k])
        elif k in ['stars']:
            col = bucketized_column(
                numeric_column(k, default_value=0, dtype=item_type_dict[k]),
                bucketized_boundary[k])
        elif k in ['categories', 'city']:
            col = embedding_column(categorical_column_with_vocabulary_list(
                k, sorted(v), default_value=-1, dtype=item_type_dict[k]),
                                   dimension=embedding_size[k])
        else:
            col = indicator_column(
                categorical_column_with_vocabulary_list(
                    k, sorted(v), default_value=-1, dtype=item_type_dict[k]))

        items_feature_columns.append(col)

    return items_feature_columns
Example #5
def define_feature_columns(dataframe):

    print("Defining feature columns...")
    feature_columns = []

    # Create embedding column for name IDs
    name_id = feature_column.categorical_column_with_vocabulary_list(
        'nconst', dataframe.nconst.unique())
    # Dimension set to 30 (approximately fourth root of the number of unique name IDs)
    name_id_embedding = feature_column.embedding_column(name_id, dimension=30)
    feature_columns.append(name_id_embedding)

    # Create indicator columns for category and genres
    indicator_column_names = ['category', 'genres']
    for col_name in indicator_column_names:
        categorical_column = feature_column.categorical_column_with_vocabulary_list(
            col_name, dataframe[col_name].unique())
        indicator_column = feature_column.indicator_column(categorical_column)
        feature_columns.append(indicator_column)

    # Create bucketized column for startYear (a.k.a. release date)
    start_year_numeric = feature_column.numeric_column('startYear')
    start_year_bucket = feature_column.bucketized_column(
        start_year_numeric, boundaries=[1927, 1940, 1950, 1960, 1970, 1980, 1990, 1995, 2000, 2005, 2010, 2015])
    feature_columns.append(start_year_bucket)

    print("Feature columns defined")
    return feature_columns
Example #6
 def bucketized_columns(self, columnsBoundaries):
     for key, value in columnsBoundaries.items():
         col = feature_column.numeric_column(key)
         col_buckets = feature_column.bucketized_column(col,
                                                        boundaries=value)
         self.sparse_columns[key] = col_buckets
     return col_buckets
Example #7
    def _get_tf_feature_cols(dataframe: pd.DataFrame):
        feature_columns = []

        # numeric cols
        for header in ['PhotoAmt', 'Fee', 'Age']:
            feature_columns.append(feature_column.numeric_column(header))

        # bucketized cols
        age = feature_column.numeric_column('Age')
        age_buckets = feature_column.bucketized_column(
            age, boundaries=[1, 2, 3, 4, 5])
        feature_columns.append(age_buckets)

        # indicator_columns
        indicator_column_names = [
            'Type', 'Color1', 'Color2', 'Gender', 'MaturitySize', 'FurLength',
            'Vaccinated', 'Sterilized', 'Health'
        ]
        for col_name in indicator_column_names:
            categorical_column = feature_column.categorical_column_with_vocabulary_list(
                col_name, dataframe[col_name].unique())
            indicator_column = feature_column.indicator_column(
                categorical_column)
            feature_columns.append(indicator_column)

        # embedding columns
        breed1 = feature_column.categorical_column_with_vocabulary_list(
            'Breed1', dataframe.Breed1.unique())
        breed1_embedding = feature_column.embedding_column(breed1, dimension=8)
        feature_columns.append(breed1_embedding)
        return feature_columns
Example #8
  def _prepare_for_crossing(self, key_name, num_bck, boundaries):
    """Prepares features for crossing.

    Whether they're continuous or categorical matters, and
    whether we have the whole dictionary or not.

    Args:
      key_name: A string representing the name of the feature
      num_bck: How many buckets to use when we know # of distinct values
      boundaries: Boundary values used when bucketizing
    Returns:
      The key to cross on: a feature column or the raw key name
    """
    key = None
    if key_name in self.continuous.keys():
      if boundaries is not None:
        # Note that cont[key_name] is a source column
        key = tfc.bucketized_column(self.continuous[key_name], boundaries)
      else:
        # We can count all the values in the dataset. Ex: boolean.
        # Note that key_name is a string
        key = tfc.categorical_column_with_identity(key_name, num_bck)
    elif key_name in self.categorical.keys():
      # It is also possible to use the categorical column instead of the
      # column name. i.e key = cat[key_name]
      key = key_name
    else:
      key = key_name

    return key
Example #9
 def _build_census_wide_columns(numeric_range=None):
     base_columns, cross_columns = [], []
     for col in ALI_DISPLAY_ADS_CONFIG['wide_muti_hot_cols']:
         base_columns.append(
             fc.indicator_column(
                 fc.categorical_column_with_hash_bucket(
                     col,
                     hash_bucket_size=1000 if
                     ALI_DISPLAY_ADS_CONFIG['vocab_size'][col] <= 1000 else
                     ALI_DISPLAY_ADS_CONFIG['vocab_size'][col] + 10000)))
     for col in ALI_DISPLAY_ADS_CONFIG['wide_bucket_cols']:
         base_columns.append(
             fc.bucketized_column(fc.numeric_column(col),
                                  boundaries=list(
                                      np.linspace(numeric_range[col][0],
                                                  numeric_range[col][1],
                                                  1000))))
     for col in ALI_DISPLAY_ADS_CONFIG['wide_cross_cols']:
         cross_columns.append(
             fc.indicator_column(
                 fc.crossed_column([col[0], col[1]],
                                   hash_bucket_size=10000)))
     feature_columns = base_columns + cross_columns
     feat_field_size = len(feature_columns)
     return feature_columns, feat_field_size
Example #10
def create_feature_columns(note_emb_size=10, note_user_emb_size=6):
    # Create the categorical columns first

    creator_ids = fc.categorical_column_with_hash_bucket("last_note_creators",
                                                         hash_bucket_size=2000,
                                                         dtype=tf.string)
    note_ids = fc.categorical_column_with_hash_bucket("last_note_ids",
                                                      20000,
                                                      dtype=tf.int64)

    creator_id = fc.categorical_column_with_hash_bucket("note_open_id", 2000)
    note_id = fc.categorical_column_with_hash_bucket("note_id",
                                                     20000,
                                                     dtype=tf.int64)

    video_duration = fc.numeric_column("note_video_duration")
    video_duration_bucket = fc.bucketized_column(source_column=video_duration,
                                                 boundaries=[5, 10, 30, 60])

    note_emb = fc.shared_embedding_columns([note_ids, note_id],
                                           note_emb_size,
                                           combiner='sum')
    creator_emb = fc.shared_embedding_columns([creator_ids, creator_id],
                                              note_user_emb_size,
                                              combiner='sum')

    my_feature_columns = note_emb + creator_emb + [video_duration_bucket]
    print("*" * 100)
    print("feature columns:")
    for i in my_feature_columns:
        print(i)
    print("*" * 100)
    return my_feature_columns
Example #11
def _add_bucketed_columns(columns, features, feature_table, vocabulary):
    for f in features:
        assert f in feature_table
        # If this is a fixed-length list feature
        if feature_table[f].feature_spec.is_list and feature_table[
                f].feature_spec.fixed:
            size = feature_table[f].feature_spec.size
            if feature_table[f].feature_spec.dtype == "int":
                numeric_col = fc.numeric_column(f,
                                                shape=(size, ),
                                                dtype=tf.int64,
                                                default_value=0)
            else:
                numeric_col = fc.numeric_column(f,
                                                shape=(size, ),
                                                default_value=0)
        # If this is not a list feature
        else:
            if feature_table[f].feature_spec.dtype == "int":
                numeric_col = fc.numeric_column(f,
                                                dtype=tf.int64,
                                                default_value=0)
            else:
                numeric_col = fc.numeric_column(f, default_value=0)
        bucketed_col = fc.bucketized_column(numeric_col,
                                            boundaries=BUCKET_BOUNDARIES[f])
        embedding_col = fc.embedding_column(bucketed_col,
                                            feature_table[f].emb_width,
                                            combiner='sqrtn')
        columns.append(embedding_col)
Example #12
def test_bucketized_column():
    price = {'price': [[5.], [15.], [25.], [35.]]}  # 4 sample rows, shape = [4, 1]
    price_column = feature_column.numeric_column('price')
    bucket_price = feature_column.bucketized_column(price_column,
                                                    [10, 20, 30, 40])
    price_bucket_tensor = feature_column.input_layer(price, [bucket_price])
    with tf.Session() as session:
        print(session.run([price_bucket_tensor]))
Example #13
def pratise():
    d = {'x': [[32], [16], [38], [98]]}
    cd = feature_column.numeric_column('x')
    bcd = feature_column.bucketized_column(cd, [10, 20, 40, 60])
    fcd = feature_column.input_layer(d, [bcd])

    with tf.Session() as sess:
        print(sess.run(fcd))
Example #14
    def data_preprocessing(self):
        """
        batch_size = 5  # a small batch size is used for this example
        train_ds = self.df_to_dataset(self.train, batch_size=batch_size)
        val_ds = self.df_to_dataset(self.val, shuffle=False, batch_size=batch_size)
        test_ds = self.df_to_dataset(self.test, shuffle=False, batch_size=batch_size)

        for feature_batch, label_batch in train_ds.take(1):
            print('Every feature:', list(feature_batch.keys()))
            print('A batch of ages:', feature_batch['age'])
            print('A batch of targets:', label_batch)

        # Create a sample batch to try out the feature columns.
        self.example_batch = next(iter(train_ds))[0]

        age = feature_column.numeric_column("age")
        self.demo(age)
        """
        feature_columns = []

        # Numeric columns
        for header in [
                'age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca'
        ]:
            feature_columns.append(feature_column.numeric_column(header))

        # Bucketized columns
        age = feature_column.numeric_column("age")
        age_buckets = feature_column.bucketized_column(
            age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
        feature_columns.append(age_buckets)

        # Categorical columns
        thal = feature_column.categorical_column_with_vocabulary_list(
            'thal', ['fixed', 'normal', 'reversible'])
        thal_one_hot = feature_column.indicator_column(thal)
        feature_columns.append(thal_one_hot)

        # Embedding columns
        thal_embedding = feature_column.embedding_column(thal, dimension=8)
        feature_columns.append(thal_embedding)

        # Crossed feature columns
        crossed_feature = feature_column.crossed_column([age_buckets, thal],
                                                        hash_bucket_size=1000)
        crossed_feature = feature_column.indicator_column(crossed_feature)
        feature_columns.append(crossed_feature)

        self.feature_layer = layers.DenseFeatures(feature_columns)

        batch_size = 32
        self.train_ds = self.df_to_dataset(self.train, batch_size=batch_size)
        self.val_ds = self.df_to_dataset(self.val,
                                         shuffle=False,
                                         batch_size=batch_size)
        self.test_ds = self.df_to_dataset(self.test,
                                          shuffle=False,
                                          batch_size=batch_size)
Example #15
def create_feature_layer(df):
    week = feature_column.numeric_column("Week")
    boundaries = []
    for i in range(1, 53):
        boundaries.append(i)
    week = feature_column.bucketized_column(week, boundaries=boundaries)
    day = feature_column.numeric_column("Day")
    boundaries = []
    for i in range(1, 8):
        boundaries.append(i)
    day = feature_column.bucketized_column(day, boundaries=boundaries)
    year = feature_column.numeric_column("Year")
    boundaries = []
    for i in range(2013, 2017):
        boundaries.append(i)
    year = feature_column.bucketized_column(year, boundaries=boundaries)
    hour = feature_column.numeric_column("std_hour")
    boundaries = []
    for i in range(0, 24):
        boundaries.append(i)
    hour = feature_column.bucketized_column(hour, boundaries=boundaries)
    arrival = feature_column.categorical_column_with_vocabulary_list(
        "Arrival", vocabulary_list=pd.Series.unique(df.Arrival).tolist())
    airline = feature_column.categorical_column_with_vocabulary_list(
        "Airline", vocabulary_list=pd.Series.unique(df.Airline).tolist())
    flight_no = feature_column.categorical_column_with_vocabulary_list(
        "flight_no", vocabulary_list=pd.Series.unique(df.flight_no).tolist())
    arrival_one_hot = feature_column.indicator_column(arrival)
    airline_one_hot = feature_column.indicator_column(airline)
    flight_no_one_hot = feature_column.indicator_column(flight_no)
    arrival_length = len(pd.Series.unique(df.Arrival).tolist())
    arrival_and_week = feature_column.crossed_column(
        [arrival, week], hash_bucket_size=(arrival_length * 52))
    arrival_and_week = feature_column.indicator_column(arrival_and_week)
    airline_length = len(pd.Series.unique(df.Airline).tolist())
    year_and_airline = feature_column.crossed_column(
        [year, airline], hash_bucket_size=(airline_length * 4))
    year_and_airline = feature_column.indicator_column(year_and_airline)
    feature_columns = []
    feature_columns = feature_columns + [
        week, arrival_one_hot, airline_one_hot, flight_no_one_hot, hour,
        arrival_and_week, year, year_and_airline
    ]
    feature_layer = tf.keras.layers.DenseFeatures(feature_columns)
    return feature_layer
Example #16
def test_bucketized_column():
    # 1. Input features
    price = {'price': [[15.], [5.], [35.], [25.]]}
    # 2. Feature columns (Dense)
    price_column = feature_column.numeric_column('price')
    # 2. Feature columns (Dense): bucketized_column is both Dense and
    # Categorical
    bucket_price = feature_column.bucketized_column(price_column, [10, 20, 30])
    # 3. Feature tensor
    price_bucket_tensor = feature_column.input_layer(price, [bucket_price])
    with tf.Session() as session:
        print(session.run([price_bucket_tensor]))
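The comments in Example #16 note that bucketized_column acts as both a dense and a categorical column. A short sketch in the same TF 1.x style can show both roles: fed to input_layer it produces a one-hot dense tensor, and handed to crossed_column it behaves as a categorical input. The 'color' column here is invented purely for illustration.

def demo_bucketized_both_roles():
    data = {'price': [[15.], [5.]], 'color': [['r'], ['b']]}
    price_column = feature_column.numeric_column('price')
    bucket_price = feature_column.bucketized_column(price_column, [10, 20, 30])
    # Categorical role: cross the price buckets with another categorical column.
    color = feature_column.categorical_column_with_vocabulary_list(
        'color', ['r', 'g', 'b'])
    price_x_color = feature_column.indicator_column(
        feature_column.crossed_column([bucket_price, color],
                                      hash_bucket_size=16))
    # Dense role: the bucketized column itself goes straight into input_layer.
    dense_tensor = feature_column.input_layer(data,
                                              [bucket_price, price_x_color])
    with tf.Session() as session:
        session.run(tf.tables_initializer())  # vocabulary lookup table
        print(session.run(dense_tensor))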
Example #17
def get_feature_columns(dataframe):
    """Creates feature columns from pd.DataFrame."""
    feature_columns = []
    feature_layer_inputs = {}

    # numeric cols
    for col_name in ['PhotoAmt', 'Fee', 'Age']:
        feature_columns.append(feature_column.numeric_column(col_name))
        feature_layer_inputs[col_name] = tf.keras.Input(shape=(1, ),
                                                        name=col_name)

    # bucketized cols
    age = feature_column.numeric_column('Age')
    age_buckets = feature_column.bucketized_column(age,
                                                   boundaries=[1, 2, 3, 4, 5])
    feature_columns.append(age_buckets)

    # indicator_columns
    indicator_column_names = [
        'Type', 'Color1', 'Color2', 'Gender', 'MaturitySize', 'FurLength',
        'Vaccinated', 'Sterilized', 'Health'
    ]
    for col_name in indicator_column_names:
        categorical_column = feature_column.categorical_column_with_vocabulary_list(
            col_name, dataframe[col_name].unique())
        indicator_column = feature_column.indicator_column(categorical_column)
        feature_columns.append(indicator_column)
        feature_layer_inputs[col_name] = tf.keras.Input(shape=(1, ),
                                                        name=col_name,
                                                        dtype=tf.string)

    # embedding columns
    breed1 = feature_column.categorical_column_with_vocabulary_list(
        'Breed1', dataframe.Breed1.unique())
    breed1_embedding = feature_column.embedding_column(breed1, dimension=16)
    feature_columns.append(breed1_embedding)
    feature_layer_inputs['Breed1'] = tf.keras.Input(shape=(1, ),
                                                    name='Breed1',
                                                    dtype=tf.string)

    # crossed columns
    animal_type = feature_column.categorical_column_with_vocabulary_list(
        'Type', ['Cat', 'Dog'])
    feature_columns.append(feature_column.indicator_column(animal_type))
    age_type_feature = feature_column.crossed_column(
        [age_buckets, animal_type], hash_bucket_size=100)
    feature_columns.append(feature_column.indicator_column(age_type_feature))
    feature_layer_inputs['Type'] = tf.keras.Input(shape=(1, ),
                                                  name='Type',
                                                  dtype=tf.string)

    return feature_columns, feature_layer_inputs
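Example #17 returns both the feature columns and a matching dict of tf.keras.Input tensors. Below is a hedged sketch of how the two might be wired into a Keras functional model, assuming dataframe is the same pandas DataFrame passed to get_feature_columns above; the original project may assemble the model differently.

import tensorflow as tf

feature_columns, feature_layer_inputs = get_feature_columns(dataframe)
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)
x = feature_layer(feature_layer_inputs)
x = tf.keras.layers.Dense(128, activation='relu')(x)
output = tf.keras.layers.Dense(1)(x)
model = tf.keras.Model(inputs=list(feature_layer_inputs.values()),
                       outputs=output)
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])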
Example #18
def test_elasticsearch_io_dataset_training():
    """Test the functionality of the ElasticsearchIODataset by training a
    tf.keras model on the structured data.
    """

    BATCH_SIZE = 2
    dataset = tfio.experimental.elasticsearch.ElasticsearchIODataset(
        nodes=[NODE], index=INDEX, doc_type=DOC_TYPE, headers=HEADERS)
    dataset = dataset.map(lambda v: (v, v.pop("survived")))
    dataset = dataset.batch(BATCH_SIZE)

    assert issubclass(type(dataset), tf.data.Dataset)

    feature_columns = []

    # Numeric column
    fare_column = feature_column.numeric_column("fare")
    feature_columns.append(fare_column)

    # Bucketized column
    age = feature_column.numeric_column("age")
    age_buckets = feature_column.bucketized_column(age, boundaries=[10, 30])
    feature_columns.append(age_buckets)

    # Categorical column
    gender = feature_column.categorical_column_with_vocabulary_list(
        "gender", ["Male", "Female"])
    gender_indicator = feature_column.indicator_column(gender)
    feature_columns.append(gender_indicator)

    # Convert the feature columns into a tf.keras layer
    feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

    # Build the model
    model = tf.keras.Sequential([
        feature_layer,
        layers.Dense(128, activation="relu"),
        layers.Dense(128, activation="relu"),
        layers.Dropout(0.1),
        layers.Dense(1),
    ])

    # Compile the model
    model.compile(
        optimizer="adam",
        loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
        metrics=["accuracy"],
    )

    # train the model
    model.fit(dataset, epochs=5)
Example #19
def test_train_model():
    """Test the dataset by training a tf.keras model"""

    dataset = tfio.experimental.mongodb.MongoDBIODataset(uri=URI,
                                                         database=DATABASE,
                                                         collection=COLLECTION)
    dataset = dataset.map(
        lambda x: tfio.experimental.serialization.decode_json(x, specs=SPECS))
    dataset = dataset.map(lambda v: (v, v.pop("survived")))
    dataset = dataset.batch(BATCH_SIZE)

    assert issubclass(type(dataset), tf.data.Dataset)

    feature_columns = []

    # Numeric column
    fare_column = feature_column.numeric_column("fare")
    feature_columns.append(fare_column)

    # Bucketized column
    age = feature_column.numeric_column("age")
    age_buckets = feature_column.bucketized_column(age, boundaries=[10, 30])
    feature_columns.append(age_buckets)

    # Categorical column
    gender = feature_column.categorical_column_with_vocabulary_list(
        "gender", ["Male", "Female"])
    gender_indicator = feature_column.indicator_column(gender)
    feature_columns.append(gender_indicator)

    # Convert the feature columns into a tf.keras layer
    feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

    # Build the model
    model = tf.keras.Sequential([
        feature_layer,
        layers.Dense(128, activation="relu"),
        layers.Dense(128, activation="relu"),
        layers.Dropout(0.1),
        layers.Dense(1),
    ])

    # Compile the model
    model.compile(
        optimizer="adam",
        loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
        metrics=["accuracy"],
    )

    # train the model
    model.fit(dataset, epochs=5)
Example #20
def build_model(genres, traits, num_cat=10):
    """
    :param genres: list of genre names
    :param traits: list of personality trait names
    :param num_cat: int, number of categories for classification
    :return: a compiled tf.keras model
    """
    # //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    # load training set data
    # train, val, test = split_data(training_set)

    # create a small batch for test purposes prior to build
    # train_ds_demo = df_to_dataset(train, batch_size=5)

    # for feature_batch, label_batch in train_ds_demo.take(1):
    #     print('Every feature:', list(feature_batch.keys()))
    #     print('A batch of Openness:', feature_batch['Openness'])
    #     print('A batch of BinProbs:', label_batch)

    # //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    # build model features

    # create bucketized personality trait columns
    bounds = list(np.linspace(0, 100, 51))
    trait_buckets = []
    for trait in traits:
        trait_feat = feature_column.numeric_column(trait)
        bucketized_feat = feature_column.bucketized_column(trait_feat,
                                                           boundaries=bounds)
        trait_buckets.append(bucketized_feat)

    # create categorical genre columns
    genre_feat = feature_column.categorical_column_with_vocabulary_list(
        'genreName', genres)
    genre_one_hot = feature_column.indicator_column(genre_feat)

    feature_columns = trait_buckets + [genre_one_hot]
    feature_layer = DenseFeatures(feature_columns)
    model = tf.keras.Sequential([
        feature_layer,
        Dense(128, activation='relu'),
        Dense(128, activation='relu'),
        Dense(num_cat)
    ])

    model = compile_model(model)

    return model
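Example #20 delegates to a compile_model helper that is not shown. Since the head is Dense(num_cat) with no activation, a plausible sketch compiles against logits; both the helper and the loss choice are assumptions, not the original code.

import tensorflow as tf

def compile_model(model):
    # Hypothetical helper: the final layer outputs raw logits over num_cat classes.
    model.compile(
        optimizer='adam',
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=['accuracy'])
    return model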
Example #21
    def __init__(self, name, params):
        super(MLP, self).__init__()
        self.model_name = name
        self.params = params
        num_features = [
            feature_column.bucketized_column(
                feature_column.numeric_column(str(i)),
                boundaries=[
                    j / (num_bin_size[i] - 1)
                    for j in range(num_bin_size[i] - 1)
                ]) for i in range(8)
        ]
        if name == "MLP_FSIW":
            print("using elapse feature")
            num_features.append(feature_column.numeric_column("elapse"))
        cate_features = [
            feature_column.embedding_column(
                feature_column.categorical_column_with_hash_bucket(
                    str(i), hash_bucket_size=cate_bin_size[i - 8]),
                dimension=8) for i in range(8, 17)
        ]

        all_features = num_features + cate_features

        self.feature_layer = tf.keras.layers.DenseFeatures(all_features)

        self.fc1 = layers.Dense(256,
                                activation=tf.nn.leaky_relu,
                                kernel_regularizer=regularizers.l2(
                                    params["l2_reg"]))
        self.bn1 = layers.BatchNormalization()
        self.fc2 = layers.Dense(256,
                                activation=tf.nn.leaky_relu,
                                kernel_regularizer=regularizers.l2(
                                    params["l2_reg"]))
        self.bn2 = layers.BatchNormalization()
        self.fc3 = layers.Dense(128,
                                activation=tf.nn.leaky_relu,
                                kernel_regularizer=regularizers.l2(
                                    params["l2_reg"]))
        self.bn3 = layers.BatchNormalization()
        print("build model {}".format(name))
        if self.model_name == "MLP_EXP_DELAY":
            self.fc4 = layers.Dense(2)
        elif self.model_name == "MLP_tn_dp":
            self.fc4 = layers.Dense(2)
        elif self.model_name in ["MLP_SIG", "MLP_FSIW"]:
            self.fc4 = layers.Dense(1)
        else:
            raise ValueError("model name {} not exist".format(name))
Example #22
def test_bucketized_column():

    data = {
        'price': [[5.], [15.], [25.], [35.]],
        'price2': [[5.], [15.], [25.], [35.]]
    }  # 4 sample rows

    price_column = feature_column.numeric_column('price')
    price_column2 = feature_column.numeric_column('price2')
    print(price_column)
    bucket_price = feature_column.bucketized_column(price_column,
                                                    [0, 10, 20, 30, 40])
    bucket_price2 = feature_column.bucketized_column(price_column2,
                                                     [0, 10, 20, 30, 40])
    print(bucket_price)

    price_bucket_tensor = feature_column.input_layer(
        data, [bucket_price, bucket_price2])

    print(type(price_bucket_tensor))

    with tf.Session() as session:
        print(session.run([price_bucket_tensor]))
Example #23
def classify_data(batch_size=5):
    from tensorflow import feature_column
    from tensorflow.keras import layers
    from sklearn.model_selection import train_test_split
    URL = 'https://storage.googleapis.com/applied-dl/heart.csv'
    dataframe = pd.read_csv(URL)
    tr, te = train_test_split(dataframe, test_size=0.2)
    tr, va = train_test_split(tr, test_size=0.2)
    print(len(tr), len(va), len(te))

    def df_to_dataset(dataframe, shuffle=True, batch_size=32):
        # Copy before popping the label so the features dict excludes 'target'
        # and the caller's DataFrame is not modified.
        dataframe = dataframe.copy()
        labels = dataframe.pop('target')
        ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
        if shuffle:
            ds = ds.shuffle(buffer_size=len(dataframe))
        return ds.batch(batch_size)

    tr_ds = df_to_dataset(tr, batch_size=batch_size)
    va_ds = df_to_dataset(va, shuffle=False, batch_size=batch_size)
    te_ds = df_to_dataset(te, shuffle=False, batch_size=batch_size)
    feature_columns = []
    for header in [
            'age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca'
    ]:
        feature_columns.append(feature_column.numeric_column(header))
    age = feature_column.numeric_column('age')
    age_buckets = feature_column.bucketized_column(
        age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
    feature_columns.append(age_buckets)
    thal = feature_column.categorical_column_with_vocabulary_list(
        'thal', ['fixed', 'normal', 'reversible'])
    feature_columns.append(feature_column.indicator_column(thal))
    feature_columns.append(feature_column.embedding_column(thal, dimension=8))
    crossed_feature = feature_column.crossed_column([age_buckets, thal],
                                                    hash_bucket_size=1000)
    feature_columns.append(feature_column.indicator_column(crossed_feature))
    feature_layer = tf.keras.layers.DenseFeatures(feature_columns)
    model = tf.keras.Sequential([
        feature_layer,
        layers.Dense(128, activation='relu'),
        layers.Dense(128, activation='relu'),
        layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    model.fit(tr_ds, validation_data=va_ds, epochs=5)
    loss, accuracy = model.evaluate(te_ds)
    print(accuracy)
Example #24
    def create_features_columns(self):
        # Vector-valued features
        user_vector = fc.numeric_column(key="user_vector",
                                        shape=(128, ),
                                        default_value=[0.0] * 128,
                                        dtype=tf.float32)
        item_vector = fc.numeric_column(key="item_vector",
                                        shape=(128, ),
                                        default_value=[0.0] * 128,
                                        dtype=tf.float32)

        # Bucketized features
        age = fc.numeric_column(key="age",
                                shape=(1, ),
                                default_value=[0],
                                dtype=tf.int64)
        age = fc.bucketized_column(
            age, boundaries=[0, 10, 20, 30, 40, 50, 60, 70, 80])
        age = fc.embedding_column(age, dimension=32, combiner='mean')

        # Categorical features
        city = fc.categorical_column_with_identity(key="city",
                                                   num_buckets=1000,
                                                   default_value=0)
        city = fc.embedding_column(city, dimension=32, combiner='mean')

        # Hashed features
        device_id = fc.categorical_column_with_hash_bucket(
            key="device_id", hash_bucket_size=1000000, dtype=tf.int64)
        device_id = fc.embedding_column(device_id,
                                        dimension=32,
                                        combiner='mean')

        item_id = fc.categorical_column_with_hash_bucket(
            key="item_id", hash_bucket_size=10000, dtype=tf.int64)
        item_id = fc.embedding_column(item_id, dimension=32, combiner='mean')

        self.user_columns["user_vector"] = user_vector
        self.user_columns["age"] = age
        self.user_columns["city"] = city
        self.user_columns["device_id"] = device_id
        self.item_columns["item_vector"] = item_vector
        self.item_columns["item_id"] = item_id

        self.feature_spec = tf.feature_column.make_parse_example_spec(
            list(self.user_columns.values()) + list(self.item_columns.values()))

        return self
Example #25
File: adult_wd.py Project: watsonjiang/tf
def build_feature_columns():
    age = feature_column.numeric_column('age')
    age_bucket = feature_column.bucketized_column(age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
    workclass = feature_column.indicator_column(feature_column.categorical_column_with_vocabulary_list('workclass',
                ['Private', 'Self-emp-not-inc', 'Self-emp-inc', 'Federal-gov', 
                'Local-gov', 'State-gov', 'Without-pay', 'Never-worked']))
    fnlwgt = feature_column.numeric_column('fnlwgt')
    education = feature_column.indicator_column(feature_column.categorical_column_with_vocabulary_list('education',
                ['Bachelors', 'Some-college', '11th', 'HS-grad', 'Prof-school',
                 'Assoc-acdm', 'Assoc-voc', '9th', '7th-8th', '12th', 'Masters',
                 '1st-4th', '10th', 'Doctorate', '5th-6th', 'Preschool']))
    education_num = feature_column.numeric_column('education_num')
    marital_status = feature_column.indicator_column(feature_column.categorical_column_with_vocabulary_list('marital_status',
                ['Married-civ-spouse', 'Divorced', 'Never-married', 'Separated', 'Widowed',
                 'Married-spouse-absent', 'Married-AF-spouse']))
    occupation = feature_column.indicator_column(feature_column.categorical_column_with_vocabulary_list('occupation',
                ['Tech-support', 'Craft-repair', 'Other-service', 'Sales', 'Exec-managerial',
                 'Prof-specialty', 'Handlers-cleaners', 'Machine-op-inspct', 'Adm-clerical', 
                 'Farming-fishing', 'Transport-moving', 'Priv-house-serv', 'Protective-serv', 
                 'Armed-Forces']))
    relationship = feature_column.indicator_column(feature_column.categorical_column_with_vocabulary_list('relationship',
                ['Wife', 'Own-child', 'Husband', 'Not-in-family', 'Other-relative', 'Unmarried']))
    race = feature_column.indicator_column(feature_column.categorical_column_with_vocabulary_list('race', 
                ['White', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other', 'Black']))
    gender = feature_column.indicator_column(feature_column.categorical_column_with_vocabulary_list('gender', 
                ['Female', 'Male']))    
    capital_gain = feature_column.numeric_column('capital_gain') 
    capital_loss = feature_column.numeric_column('capital_loss')
    hours_per_week = feature_column.numeric_column('hours_per_week')
    native_country = feature_column.indicator_column(feature_column.categorical_column_with_vocabulary_list('native_country',
                ['United-States', 'Cambodia', 'England', 'Puerto-Rico', 'Canada', 'Germany',
                 'Outlying-US(Guam-USVI-etc)', 'India', 'Japan', 'Greece', 'South', 'China', 
                 'Cuba', 'Iran', 'Honduras', 'Philippines', 'Italy', 'Poland', 'Jamaica', 'Vietnam',
                 'Mexico', 'Portugal', 'Ireland', 'France', 'Dominican-Republic', 'Laos', 'Ecuador',
                 'Taiwan', 'Haiti', 'Columbia', 'Hungary', 'Guatemala', 'Nicaragua', 'Scotland',
                 'Thailand', 'Yugoslavia', 'El-Salvador', 'Trinadad&Tobago', 'Peru', 'Hong', 
                 'Holand-Netherlands']))
    race_gender = feature_column.indicator_column(feature_column.crossed_column([
        feature_column.categorical_column_with_vocabulary_list('race', ['White', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other', 'Black']),
        feature_column.categorical_column_with_vocabulary_list('gender', ['Female', 'Male']) ], hash_bucket_size=10))

    wide = [age_bucket, workclass, fnlwgt, education, education_num, occupation, relationship, race, gender, capital_gain, capital_loss, hours_per_week, native_country, race_gender]    
    deep = [age, workclass, fnlwgt, education, education_num, occupation, relationship, race, gender, capital_gain, capital_loss, hours_per_week, native_country]
    return (wide, deep)
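A hedged sketch of how the (wide, deep) lists returned by build_feature_columns might be consumed, assuming the canonical wide-and-deep estimator; adult_wd.py itself may wire them up differently.

import tensorflow as tf

wide, deep = build_feature_columns()
estimator = tf.estimator.DNNLinearCombinedClassifier(
    linear_feature_columns=wide,   # crossed/bucketized/indicator columns
    dnn_feature_columns=deep,      # numeric and indicator columns
    dnn_hidden_units=[100, 50])
# Training would then proceed with estimator.train(input_fn=make_input_fn()),
# where make_input_fn is a hypothetical function yielding the census feature
# dict and income labels.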
Example #26
def create_feature_columns(note_emb_size=10, note_user_emb_size=6):
    # Create the categorical columns first

    creator_ids = fc.categorical_column_with_hash_bucket("last_note_creators",
                                                         hash_bucket_size=2000,
                                                         dtype=tf.string)
    note_ids = fc.categorical_column_with_hash_bucket("last_note_ids",
                                                      20000,
                                                      dtype=tf.int64)

    creator_id = fc.categorical_column_with_hash_bucket("note_open_id", 2000)
    note_id = fc.categorical_column_with_hash_bucket("note_id",
                                                     20000,
                                                     dtype=tf.int64)

    video_duration = fc.numeric_column("note_video_duration")
    video_duration_bucket = fc.bucketized_column(source_column=video_duration,
                                                 boundaries=[5, 10, 30, 60])

    note_emb = fc.shared_embedding_columns([note_ids, note_id],
                                           note_emb_size,
                                           combiner='sum')
    creator_emb = fc.shared_embedding_columns([creator_ids, creator_id],
                                              note_user_emb_size,
                                              combiner='sum')

    # phoneBrandId = fc.categorical_column_with_hash_bucket("phoneBrand", 1000)
    # phoneBrand = fc.embedding_column(phoneBrandId, 20)
    # phoneResolutionId = fc.categorical_column_with_hash_bucket("phoneResolution", 500)
    # phoneResolution = fc.embedding_column(phoneResolutionId, 10)
    # phoneOs = fc.indicator_column(fc.categorical_column_with_vocabulary_list("phoneOs", ["android", "ios"], default_value=0))
    # gender = fc.indicator_column(fc.categorical_column_with_identity("gender", num_buckets=3, default_value=0))
    # city_id = fc.categorical_column_with_hash_bucket("city", 700)
    # city = fc.embedding_column(city_id, 16)
    # hour = fc.indicator_column(fc.categorical_column_with_identity("hour", 24, default_value=0))

    my_feature_columns = note_emb + creator_emb + [video_duration_bucket]
    print("*" * 100)
    print("feature columns:")
    for i in my_feature_columns:
        print(i)
    print("*" * 100)
    return my_feature_columns
Example #27
 def _build_census_deep_columns(emb_dim=8, numeric_range=None):
     feature_columns = []
     for col in ALI_DISPLAY_ADS_CONFIG['deep_emb_cols']:
         feature_columns.append(
             fc.embedding_column(fc.categorical_column_with_hash_bucket(
                 col,
                 hash_bucket_size=1000
                 if ALI_DISPLAY_ADS_CONFIG['vocab_size'][col] <= 1000 else
                 ALI_DISPLAY_ADS_CONFIG['vocab_size'][col] + 10000),
                                 dimension=emb_dim))
     for col in ALI_DISPLAY_ADS_CONFIG['deep_bucket_emb_cols']:
         feature_columns.append(
             fc.embedding_column(fc.bucketized_column(
                 fc.numeric_column(col),
                 boundaries=list(
                     np.linspace(numeric_range[col][0],
                                 numeric_range[col][1], 1000))),
                                 dimension=emb_dim))
     feat_field_size = len(feature_columns)
     return feature_columns, feat_field_size
Example #28
def feature_json_parse():
    feature_json = open('test.json', 'r').read()
    feature_json = demjson.decode(feature_json)

    feature_columns = []
    for feature_line in feature_json['tensorTransform']:
        feature_type_name = feature_line['name']
        feature_para = feature_line['parameters']

        if feature_type_name == 'NumericColumn':
            feature_columns.append(
                feature_column.numeric_column(feature_para['input_tensor']))
        elif feature_type_name == 'BucketizedColumn':
            feature = feature_column.numeric_column(
                feature_para['input_tensor'])
            feature_columns.append(
                feature_column.bucketized_column(
                    feature, boundaries=feature_para['boundaries']))
        else:
            print(feature_type_name)
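Example #28 reads a test.json whose structure is only implied by the parser. A hypothetical configuration inferred from the code above (the real file in the source project may differ):

import json

sample_config = {
    "tensorTransform": [
        {"name": "NumericColumn",
         "parameters": {"input_tensor": "age"}},
        {"name": "BucketizedColumn",
         "parameters": {"input_tensor": "age",
                        "boundaries": [18, 30, 45, 60]}},
    ]
}
with open('test.json', 'w') as f:
    json.dump(sample_config, f)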
Example #29
def build_model():
    feature_columns = []

    for header in [
            'age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca'
    ]:
        feature_columns.append(feature_column.numeric_column(header))

    age = feature_column.numeric_column("age")
    age_buckets = feature_column.bucketized_column(
        age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
    feature_columns.append(age_buckets)

    thal = feature_column.categorical_column_with_vocabulary_list(
        'thal', ['fixed', 'normal', 'reversible'])
    thal_one_hot = feature_column.indicator_column(thal)
    feature_columns.append(thal_one_hot)

    thal_embedding = feature_column.embedding_column(thal, dimension=8)
    feature_columns.append(thal_embedding)

    crossed_feature = feature_column.crossed_column([age_buckets, thal],
                                                    hash_bucket_size=1000)
    crossed_feature = feature_column.indicator_column(crossed_feature)
    feature_columns.append(crossed_feature)

    feature_layer = keras.layers.DenseFeatures(feature_columns)

    model = tf.keras.Sequential([
        feature_layer,
        keras.layers.Dense(128, activation='relu'),
        keras.layers.Dense(128, activation='relu'),
        keras.layers.Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'],
                  run_eagerly=True)

    return model
Example #30
 def transform(self, output_tensors):
     input_tensor_name = self.parameters.get("input_tensor")
     output_tensor_name = self.parameters.get("output_tensor")
     if "boundaries" in self.parameters:
         boundaries = self.parameters.get("boundaries")
         if not isinstance(boundaries, list):
             boundaries = str(boundaries).replace(' ', '')
             pattern = re.compile(
                 r'np\.linspace\(([0-9]+\.[0-9]+),([0-9]+\.[0-9]+),([0-9]+\.[0-9]+)\)')
             result = pattern.findall(boundaries)
             boundaries = list(np.linspace(float(result[0][0]),
                                           float(result[0][1]),
                                           int(float(result[0][2]))))
     else:
         msg = "parameters error, bucketized_column requires boundaries"
         logger.error(msg)
         raise ParametersError(msg)
     print("input_tensor_name:", input_tensor_name)
     input_tensor = output_tensors.get(input_tensor_name)
     output_tensors[output_tensor_name] = fc.bucketized_column(
         source_column=input_tensor,
         boundaries=boundaries)