Example #1
import pandas as pd
import tensorflow as tf
from tensorflow import feature_column
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split


def main():
    URL = 'https://storage.googleapis.com/applied-dl/heart.csv'
    dataframe = pd.read_csv(URL)
    print(dataframe.head())

    train, test = train_test_split(dataframe, test_size=0.2)
    train, val = train_test_split(train, test_size=0.2)
    print(len(train), 'train examples')
    print(len(val), 'validation examples')
    print(len(test), 'test examples')

    batch_size = 5  # a small batch size, used for demonstration
    train_ds = df_to_dataset(train, batch_size=batch_size)
    val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
    test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

    for feature_batch, label_batch in train_ds.take(1):
        print('Every feature:', list(feature_batch.keys()))
        print('A batch of ages:', feature_batch['age'])
        print('A batch of targets:', label_batch)

    # We will use this batch to demonstrate several types of feature columns
    example_batch = next(iter(train_ds))[0]

    # A utility method to create a feature column
    # and to transform a batch of data
    def demo(feature_column):
        feature_layer = layers.DenseFeatures(feature_column)
        print(feature_layer(example_batch).numpy())

    age = feature_column.numeric_column("age")
    demo(age)

    age_buckets = feature_column.bucketized_column(
        age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
    demo(age_buckets)

    thal = feature_column.categorical_column_with_vocabulary_list(
        'thal', ['fixed', 'normal', 'reversible'])

    thal_one_hot = feature_column.indicator_column(thal)
    demo(thal_one_hot)

    # Notice the input to the embedding column is the categorical column
    # we previously created
    thal_embedding = feature_column.embedding_column(thal, dimension=8)
    demo(thal_embedding)

    thal_hashed = feature_column.categorical_column_with_hash_bucket(
        'thal', hash_bucket_size=1000)
    demo(feature_column.indicator_column(thal_hashed))

    crossed_feature = feature_column.crossed_column([age_buckets, thal],
                                                    hash_bucket_size=1000)
    demo(feature_column.indicator_column(crossed_feature))

    feature_columns = []

    # numeric columns
    for header in [
            'age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca'
    ]:
        feature_columns.append(feature_column.numeric_column(header))

    # bucketized columns
    age_buckets = feature_column.bucketized_column(
        age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
    feature_columns.append(age_buckets)

    # categorical columns
    thal = feature_column.categorical_column_with_vocabulary_list(
        'thal', ['fixed', 'normal', 'reversible'])
    thal_one_hot = feature_column.indicator_column(thal)
    feature_columns.append(thal_one_hot)

    # embedding columns
    thal_embedding = feature_column.embedding_column(thal, dimension=8)
    feature_columns.append(thal_embedding)

    # crossed columns
    crossed_feature = feature_column.crossed_column([age_buckets, thal],
                                                    hash_bucket_size=1000)
    crossed_feature = feature_column.indicator_column(crossed_feature)
    feature_columns.append(crossed_feature)

    feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

    batch_size = 32
    train_ds = df_to_dataset(train, batch_size=batch_size)
    val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
    test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

    model = tf.keras.Sequential([
        feature_layer,
        layers.Dense(128, activation='relu'),
        layers.Dense(128, activation='relu'),
        layers.Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'],
                  run_eagerly=True)

    model.fit(train_ds, validation_data=val_ds, epochs=5)

    loss, accuracy = model.evaluate(test_ds)
    print("Accuracy", accuracy)
Example #2

def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    dataframe = dataframe.copy()
    labels = dataframe.pop('species')
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    return ds


feature_columns = []

for header in ['sl', 'sw', 'pl', 'pw']:
    feature_columns.append(feature_column.numeric_column(header))

feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

batch_size = 32
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

model = tf.keras.Sequential([
    feature_layer,
    layers.Dense(128, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])
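
The snippet stops after defining the model. A minimal sketch of how it might be compiled and trained, assuming `train`, `val`, and `test` come from an earlier split and that `species` has been encoded as a 0/1 label to match the single sigmoid unit:

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.fit(train_ds, validation_data=val_ds, epochs=5)
loss, accuracy = model.evaluate(test_ds)
print('Accuracy', accuracy)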
Example #3
def create_feature_columns():
    # user feature
    gender = fc.indicator_column(
        fc.categorical_column_with_identity("gender",
                                            num_buckets=3,
                                            default_value=0))
    age_class = fc.indicator_column(
        fc.categorical_column_with_identity("age_class",
                                            num_buckets=7,
                                            default_value=0))
    has_baby = fc.indicator_column(
        fc.categorical_column_with_identity("has_baby",
                                            num_buckets=2,
                                            default_value=0))
    baby_gender = fc.indicator_column(
        fc.categorical_column_with_identity("baby_gender",
                                            num_buckets=3,
                                            default_value=0))
    baby_age = fc.indicator_column(
        fc.categorical_column_with_identity("baby_age",
                                            num_buckets=7,
                                            default_value=0))
    grade = fc.indicator_column(
        fc.categorical_column_with_identity("grade",
                                            num_buckets=7,
                                            default_value=0))
    rfm_type = fc.indicator_column(
        fc.categorical_column_with_identity("bi_rfm_type",
                                            num_buckets=12,
                                            default_value=0))
    cate1_price_prefer = fc.indicator_column(
        fc.categorical_column_with_identity("cate1_price_prefer",
                                            num_buckets=6,
                                            default_value=0))
    cate2_price_prefer = fc.indicator_column(
        fc.categorical_column_with_identity("cate2_price_prefer",
                                            num_buckets=6,
                                            default_value=0))
    cate3_price_prefer = fc.indicator_column(
        fc.categorical_column_with_identity("cate3_price_prefer",
                                            num_buckets=6,
                                            default_value=0))
    city_id = fc.categorical_column_with_hash_bucket("city", 700)
    city = fc.embedding_column(city_id, 16)
    shop_visit_cnt = fc.indicator_column(
        fc.categorical_column_with_identity("shop_visit_cnt_rank",
                                            num_buckets=20,
                                            default_value=19))
    shop_visit_usr = fc.indicator_column(
        fc.categorical_column_with_identity("shop_visit_usr_rank",
                                            num_buckets=20,
                                            default_value=19))

    # item feature
    c2id = fc.categorical_column_with_hash_bucket("cate2Id",
                                                  10000,
                                                  dtype=tf.int64)
    c2id_embed = fc.embedding_column(c2id, 32)
    modified_time = fc.numeric_column("modified_time", default_value=0.0)
    modified_time_sqrt = fc.numeric_column("modified_time_sqrt",
                                           default_value=0.0)
    modified_time_square = fc.numeric_column("modified_time_square",
                                             default_value=0.0)
    props_sex = fc.indicator_column(
        fc.categorical_column_with_vocabulary_list("props_sex",
                                                   ["男", "女", "通用", "情侣"],
                                                   default_value=0))
    brand_grade = fc.indicator_column(
        fc.categorical_column_with_vocabulary_list(
            "brand_grade", ["A类品牌", "B类品牌", "C类品牌", "D类品牌"], default_value=0))
    shipment_rate = fc.numeric_column("shipment_rate", default_value=0.0)
    shipping_rate = fc.numeric_column("shipping_rate", default_value=0.0)
    ipv_ntile = fc.bucketized_column(
        fc.numeric_column("ipv_ntile", dtype=tf.int64, default_value=99),
        boundaries=[1, 2, 3, 4, 5, 10, 20, 50, 80])
    pay_ntile = fc.bucketized_column(
        fc.numeric_column("pay_ntile", dtype=tf.int64, default_value=99),
        boundaries=[1, 2, 3, 4, 5, 10, 20, 50, 80])
    price = fc.numeric_column("price_norm", default_value=0.0)
    ctr_1d = fc.numeric_column("ctr_1d", default_value=0.0)
    cvr_1d = fc.numeric_column("cvr_1d", default_value=0.0)
    uv_cvr_1d = fc.numeric_column("uv_cvr_1d", default_value=0.0)
    ctr_1w = fc.numeric_column("ctr_1w", default_value=0.0)
    cvr_1w = fc.numeric_column("cvr_1w", default_value=0.0)
    uv_cvr_1w = fc.numeric_column("uv_cvr_1w", default_value=0.0)
    ctr_2w = fc.numeric_column("ctr_2w", default_value=0.0)
    cvr_2w = fc.numeric_column("cvr_2w", default_value=0.0)
    uv_cvr_2w = fc.numeric_column("uv_cvr_2w", default_value=0.0)
    ctr_1m = fc.numeric_column("ctr_1m", default_value=0.0)
    cvr_1m = fc.numeric_column("cvr_1m", default_value=0.0)
    uv_cvr_1m = fc.numeric_column("uv_cvr_1m", default_value=0.0)
    pay_qty_1d = fc.numeric_column("pay_qty_1d", default_value=0.0)
    pay_qty_1w = fc.numeric_column("pay_qty_1w", default_value=0.0)
    pay_qty_2w = fc.numeric_column("pay_qty_2w", default_value=0.0)
    pay_qty_1m = fc.numeric_column("pay_qty_1m", default_value=0.0)
    cat2_pay_qty = fc.numeric_column("cat2_pay_qty_1d", default_value=0.0)
    cat1_pay_qty = fc.numeric_column("cat1_pay_qty_1d", default_value=0.0)
    brd_pay_qty = fc.numeric_column("brd_pay_qty_1d", default_value=0.0)
    slr_pay_qty_1d = fc.numeric_column("slr_pay_qty_1d", default_value=0.0)
    slr_pay_qty_1w = fc.numeric_column("slr_pay_qty_1w", default_value=0.0)
    slr_pay_qty_2w = fc.numeric_column("slr_pay_qty_2w", default_value=0.0)
    slr_pay_qty_1m = fc.numeric_column("slr_pay_qty_1m", default_value=0.0)
    slr_brd_pay_qty_1d = fc.numeric_column("slr_brd_pay_qty_1d",
                                           default_value=0.0)
    slr_brd_pay_qty_1w = fc.numeric_column("slr_brd_pay_qty_1w",
                                           default_value=0.0)
    slr_brd_pay_qty_2w = fc.numeric_column("slr_brd_pay_qty_2w",
                                           default_value=0.0)
    slr_brd_pay_qty_1m = fc.numeric_column("slr_brd_pay_qty_1m",
                                           default_value=0.0)
    weighted_ipv = fc.numeric_column("weighted_ipv", default_value=0.0)
    cat1_weighted_ipv = fc.numeric_column("cat1_weighted_ipv",
                                          default_value=0.0)
    cate_weighted_ipv = fc.numeric_column("cate_weighted_ipv",
                                          default_value=0.0)
    slr_weighted_ipv = fc.numeric_column("slr_weighted_ipv", default_value=0.0)
    brd_weighted_ipv = fc.numeric_column("brd_weighted_ipv", default_value=0.0)
    cms_scale = fc.numeric_column("cms_scale", default_value=0.0)
    cms_scale_sqrt = fc.numeric_column("cms_scale_sqrt", default_value=0.0)

    # context feature
    matchScore = fc.numeric_column("matchScore", default_value=0.0)
    popScore = fc.numeric_column("popScore", default_value=0.0)
    brandPrefer = fc.numeric_column("brandPrefer", default_value=0.0)
    cate2Prefer = fc.numeric_column("cate2Prefer", default_value=0.0)
    catePrefer = fc.numeric_column("catePrefer", default_value=0.0)
    sellerPrefer = fc.numeric_column("sellerPrefer", default_value=0.0)
    matchType = fc.indicator_column(
        fc.categorical_column_with_identity("matchType", 9, default_value=0))
    position = fc.bucketized_column(fc.numeric_column("position",
                                                      dtype=tf.int64,
                                                      default_value=301),
                                    boundaries=[
                                        1, 2, 3, 4, 5, 6, 7, 8, 9, 15, 20, 30,
                                        40, 50, 80, 100, 150, 200, 300
                                    ])
    triggerNum = fc.indicator_column(
        fc.categorical_column_with_identity("triggerNum", 41,
                                            default_value=40))
    triggerRank = fc.indicator_column(
        fc.categorical_column_with_identity("triggerRank",
                                            41,
                                            default_value=40))
    sceneType = fc.indicator_column(
        fc.categorical_column_with_identity("type", 2, default_value=0))
    hour = fc.indicator_column(
        fc.categorical_column_with_identity("hour", 24, default_value=0))
    phoneBrandId = fc.categorical_column_with_hash_bucket("phoneBrand", 1000)
    phoneResolutionId = fc.categorical_column_with_hash_bucket(
        "phoneResolution", 500)
    phoneBrand = fc.embedding_column(phoneBrandId, 20)
    phoneResolution = fc.embedding_column(phoneResolutionId, 10)
    phoneOs = fc.indicator_column(
        fc.categorical_column_with_vocabulary_list("phoneOs",
                                                   ["android", "ios"],
                                                   default_value=0))

    global my_feature_columns
    my_feature_columns = [
        matchScore, matchType, position, triggerNum, triggerRank, sceneType,
        hour, phoneBrand, phoneResolution, phoneOs, popScore, sellerPrefer,
        brandPrefer, cate2Prefer, catePrefer, gender, age_class, has_baby,
        baby_gender, baby_age, grade, rfm_type, city, price, props_sex,
        brand_grade, cate1_price_prefer, cate2_price_prefer,
        cate3_price_prefer, modified_time, modified_time_sqrt,
        modified_time_square, shipment_rate, shipping_rate, ipv_ntile,
        pay_ntile, shop_visit_cnt, shop_visit_usr, c2id_embed, uv_cvr_1d,
        uv_cvr_1w, uv_cvr_2w, uv_cvr_1m, ctr_1d, ctr_1w, ctr_2w, ctr_1m,
        cvr_1d, cvr_1w, cvr_2w, cvr_1m, pay_qty_1d, pay_qty_1w, pay_qty_2w,
        pay_qty_1m, cat2_pay_qty, cat1_pay_qty, brd_pay_qty, slr_pay_qty_1d,
        slr_pay_qty_1w, slr_pay_qty_2w, slr_pay_qty_1m, slr_brd_pay_qty_1d,
        slr_brd_pay_qty_1w, slr_brd_pay_qty_2w, slr_brd_pay_qty_1m,
        weighted_ipv, cat1_weighted_ipv, cate_weighted_ipv, slr_weighted_ipv,
        brd_weighted_ipv, cms_scale, cms_scale_sqrt
    ]
    return my_feature_columns
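
A minimal sketch of how the returned columns might be consumed (hypothetical usage; the original presumably wires them into an estimator or Keras model elsewhere). All of the columns above are dense-compatible (numeric, bucketized, indicator, or embedding), so they can go straight into a DenseFeatures layer:

feature_layer = tf.keras.layers.DenseFeatures(create_feature_columns())

def dense_input(features):
    # `features` is a dict mapping feature names to tensors, e.g. one
    # batch from a tf.data pipeline or an estimator input_fn.
    return feature_layer(features)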
def transform_from_code_gen(source_inputs):
    education_hash_fc = fc.categorical_column_with_hash_bucket(
        "education", hash_bucket_size=education_hash.hash_bucket_size)

    occupation_hash_fc = fc.categorical_column_with_hash_bucket(
        "occupation", hash_bucket_size=occupation_hash.hash_bucket_size)

    native_country_hash_fc = fc.categorical_column_with_hash_bucket(
        "native_country",
        hash_bucket_size=native_country_hash.hash_bucket_size)

    workclass_lookup_fc = fc.categorical_column_with_vocabulary_list(
        "workclass", vocabulary_list=workclass_lookup.vocabulary_list)

    marital_status_lookup_fc = fc.categorical_column_with_vocabulary_list(
        "marital_status",
        vocabulary_list=marital_status_lookup.vocabulary_list)

    relationship_lookup_fc = fc.categorical_column_with_vocabulary_list(
        "relationship", vocabulary_list=relationship_lookup.vocabulary_list)

    race_lookup_fc = fc.categorical_column_with_vocabulary_list(
        "race", vocabulary_list=race_lookup.vocabulary_list)

    sex_lookup_fc = fc.categorical_column_with_vocabulary_list(
        "sex", vocabulary_list=sex_lookup.vocabulary_list)

    age_bucketize_fc = fc.bucketized_column(
        fc.numeric_column("age"), boundaries=age_bucketize.boundaries)

    capital_gain_bucketize_fc = fc.bucketized_column(
        fc.numeric_column("capital_gain"),
        boundaries=capital_gain_bucketize.boundaries,
    )

    capital_loss_bucketize_fc = fc.bucketized_column(
        fc.numeric_column("capital_loss"),
        boundaries=capital_loss_bucketize.boundaries,
    )

    hours_per_week_bucketize_fc = fc.bucketized_column(
        fc.numeric_column("hours_per_week"),
        boundaries=hours_per_week_bucketize.boundaries,
    )

    group1_fc = edl_fc.concatenated_categorical_column(categorical_columns=[
        workclass_lookup_fc,
        hours_per_week_bucketize_fc,
        capital_gain_bucketize_fc,
        capital_loss_bucketize_fc,
    ])

    group2_fc = edl_fc.concatenated_categorical_column(categorical_columns=[
        education_hash_fc,
        marital_status_lookup_fc,
        relationship_lookup_fc,
        occupation_hash_fc,
    ])

    group3_fc = edl_fc.concatenated_categorical_column(categorical_columns=[
        age_bucketize_fc,
        sex_lookup_fc,
        race_lookup_fc,
        native_country_hash_fc,
    ])

    group1_wide_embedding_fc = fc.embedding_column(
        group1_fc,
        dimension=group1_embedding_wide.output_dim,
    )

    group2_wide_embedding_fc = fc.embedding_column(
        group2_fc,
        dimension=group2_embedding_wide.output_dim,
    )

    group1_deep_embedding_fc = fc.embedding_column(
        group1_fc,
        dimension=group1_embedding_deep.output_dim,
    )

    group2_deep_embedding_fc = fc.embedding_column(
        group2_fc,
        dimension=group2_embedding_deep.output_dim,
    )

    group3_deep_embedding_fc = fc.embedding_column(
        group3_fc,
        dimension=group3_embedding_deep.output_dim,
    )

    wide_feature_columns = [
        group1_wide_embedding_fc,
        group2_wide_embedding_fc,
    ]

    deep_feature_columns = [
        group1_deep_embedding_fc,
        group2_deep_embedding_fc,
        group3_deep_embedding_fc,
    ]

    return (
        tf.keras.layers.DenseFeatures(wide_feature_columns)(source_inputs),
        tf.keras.layers.DenseFeatures(deep_feature_columns)(source_inputs),
    )
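
The function returns dense tensors for the wide and deep parts of the network. A minimal sketch of how they might be combined into a wide-and-deep model, assuming `source_inputs` is a dict of `tf.keras.Input` tensors (hypothetical wiring, not from the original):

wide, deep = transform_from_code_gen(source_inputs)
deep = tf.keras.layers.Dense(64, activation='relu')(deep)
logits = tf.keras.layers.Dense(1)(tf.keras.layers.concatenate([wide, deep]))
model = tf.keras.Model(inputs=list(source_inputs.values()), outputs=logits)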
train_inpf = functools.partial(easy_input_function,
                               train_df,
                               label_key='charged_off',
                               num_epochs=5,
                               shuffle=True,
                               batch_size=20000)  # 300000 # 230934
#val_inpf = functools.partial(easy_input_function, val_df, label_key='charged_off', num_epochs=1, shuffle=False, batch_size=val_df.shape[0]) #200000
test_inpf = functools.partial(easy_input_function,
                              test_df,
                              label_key='charged_off',
                              num_epochs=1,
                              shuffle=False,
                              batch_size=test_df.shape[0])  #200000
###################################################################

#DEFINE ALL NUMERIC COLUMNS

loan_amnt = fc.numeric_column('loan_amnt')
term = fc.numeric_column('term')
installment = fc.numeric_column('installment')
emp_length = fc.numeric_column('emp_length')
dti = fc.numeric_column('dti')
earliest_cr_line = fc.numeric_column('earliest_cr_line')
open_acc = fc.numeric_column('open_acc')
pub_rec = fc.numeric_column('pub_rec')
revol_util = fc.numeric_column('revol_util')
total_acc = fc.numeric_column('total_acc')
mort_acc = fc.numeric_column('mort_acc')
pub_rec_bankruptcies = fc.numeric_column('pub_rec_bankruptcies')
log_annual_inc = fc.numeric_column('log_annual_inc')
fico_score = fc.numeric_column('fico_score')
log_revol_bal = fc.numeric_column('log_revol_bal')
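
A minimal sketch of how these numeric columns and the input functions above might be wired into a canned estimator (hypothetical usage; the original script presumably does this further down):

numeric_columns = [
    loan_amnt, term, installment, emp_length, dti, earliest_cr_line,
    open_acc, pub_rec, revol_util, total_acc, mort_acc,
    pub_rec_bankruptcies, log_annual_inc, fico_score, log_revol_bal
]
classifier = tf.estimator.LinearClassifier(feature_columns=numeric_columns)
classifier.train(train_inpf)
result = classifier.evaluate(test_inpf)
print(result)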
Example #6
if isinstance(x_train, np.ndarray):
    print("data have been loaded as numpy array")

features = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
            'PTRATIO', 'B', 'LSTAT']

x_train_df = pd.DataFrame(data=x_train, columns=features)
x_test_df = pd.DataFrame(data=x_test, columns=features)
y_train_df = pd.DataFrame(data=y_train, columns=['price'])
y_test_df = pd.DataFrame(data=y_test, columns=['price'])

print(x_train_df.head())

feature_columns = []
for feature_name in features:
    feature_columns.append(feature_column.numeric_column(feature_name, dtype=tf.float32))

"""
Have to create an input pipeline using tf.data
"""


def estimator_input_fn(df_data, df_label, epochs=10, shuffle=True, batch_size=32):
    def input_function():
        ds = tf.data.Dataset.from_tensor_slices((dict(df_data), df_label))
        if shuffle:
            ds = ds.shuffle(100)
        ds = ds.batch(batch_size).repeat(epochs)
        return ds

    return input_function
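
A minimal usage sketch for the pieces above (hypothetical; a canned estimator is one natural consumer of these input functions):

train_input_fn = estimator_input_fn(x_train_df, y_train_df)
test_input_fn = estimator_input_fn(x_test_df, y_test_df, epochs=1, shuffle=False)

linear_est = tf.estimator.LinearRegressor(feature_columns=feature_columns)
linear_est.train(train_input_fn, steps=100)
result = linear_est.evaluate(test_input_fn)
print(result)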
Example #7
    labels_one_hot = to_categorical(labels, num_classes=40)

    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels_one_hot))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    return ds


batch_size = 20
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

feature_columns = []
fc_age = feature_column.numeric_column('age')
fc_age_buckets = feature_column.bucketized_column(fc_age, boundaries=[20, 30])
feature_columns.append(fc_age_buckets)

fc_gender = feature_column.categorical_column_with_vocabulary_list(
    'gender', ['Male', 'Female'])
fc_gender_one_hot = feature_column.indicator_column(fc_gender)
feature_columns.append(fc_gender_one_hot)

fc_emotion = feature_column.categorical_column_with_vocabulary_list(
    'emotion', ['Happy', 'Sad', 'Fear', 'Disgust', 'Anger', 'Surprice'])
fc_emotion_one_hot = feature_column.indicator_column(fc_emotion)
feature_columns.append(fc_emotion_one_hot)

fc_color = feature_column.categorical_column_with_vocabulary_list(
    'color', ['Red', 'Blue', 'Green', 'White', 'Gray'])
fc_color_one_hot = feature_column.indicator_column(fc_color)
feature_columns.append(fc_color_one_hot)
Example #8
    def create_feature_columns(self):
        feature_columns = []

        numeric_cols = [
            'owner_influence', 'is_commented_by_connections', 'is_liked_by_me',
            'is_liked_by_connections', 'poster_gender', 'poster_influence',
            'participant1_gender', 'participant1_influence',
            'participant2_gender', 'participant2_influence',
            'participant3_gender', 'participant3_influence'
        ]

        # numeric cols
        for header in numeric_cols:
            feature_columns.append(feature_column.numeric_column(header))

        # bucketized columns

        # age
        step = len(self.df.age) // 8
        sorted_ages = sorted(self.df.age)
        age_boundaries = [sorted_ages[i * step] for i in range(1, 8)]

        age = feature_column.numeric_column("age")
        age_buckets = feature_column.bucketized_column(
            age, boundaries=age_boundaries)
        feature_columns.append(age_buckets)

        # number_of_likes
        likes_num = feature_column.numeric_column("number_of_likes")
        likes_num_buckets = feature_column.bucketized_column(
            likes_num, boundaries=[2, 5, 10, 20, 50, 100])
        feature_columns.append(likes_num_buckets)

        # number_of_comments
        comments_num = feature_column.numeric_column("number_of_comments")
        comments_num_buckets = feature_column.bucketized_column(
            comments_num, boundaries=[1, 2, 5, 10, 20, 50, 100])
        feature_columns.append(comments_num_buckets)

        # indicator columns for categorical features

        app_type = feature_column.categorical_column_with_vocabulary_list(
            'app_type', self.df.app_type.unique())
        app_type_1hot = feature_column.indicator_column(app_type)
        feature_columns.append(app_type_1hot)

        owner_type = feature_column.categorical_column_with_vocabulary_list(
            'owner_type', self.df.owner_type.unique())
        owner_type_1hot = feature_column.indicator_column(owner_type)
        feature_columns.append(owner_type_1hot)

        poster_focus = feature_column.categorical_column_with_vocabulary_list(
            'poster_focus', [
                'engineering', 'sales', 'marketing', 'management', 'financial',
                'other'
            ])
        poster_focus_1hot = feature_column.indicator_column(poster_focus)
        feature_columns.append(poster_focus_1hot)

        # functions to reduce code duplication
        def participant_action(part_action):
            participant_action = feature_column.categorical_column_with_vocabulary_list(
                part_action, ['commented', 'liked', 'viewed'])
            return participant_action

        def participant_focus(part_f):
            participant_focus = feature_column.categorical_column_with_vocabulary_list(
                part_f, [
                    'engineering', 'sales', 'marketing', 'management',
                    'financial', 'other', 'none'
                ])
            return participant_focus

        participant1_action = participant_action("participant1_action")
        participant2_action = participant_action("participant2_action")
        participant3_action = participant_action("participant3_action")

        participant1_focus = participant_focus("participant1_focus")
        participant2_focus = participant_focus("participant2_focus")
        participant3_focus = participant_focus("participant3_focus")

        feature_columns.append(
            feature_column.indicator_column(participant1_action))
        feature_columns.append(
            feature_column.indicator_column(participant1_focus))
        feature_columns.append(
            feature_column.indicator_column(participant2_action))
        feature_columns.append(
            feature_column.indicator_column(participant2_focus))
        feature_columns.append(
            feature_column.indicator_column(participant3_action))
        feature_columns.append(
            feature_column.indicator_column(participant3_focus))

        # feature crosses for participant action and focus
        crossed_feature1 = feature_column.crossed_column(
            [participant1_action, participant1_focus], hash_bucket_size=1000)
        crossed_feature1 = feature_column.indicator_column(crossed_feature1)
        feature_columns.append(crossed_feature1)

        crossed_feature2 = feature_column.crossed_column(
            [participant2_action, participant2_focus], hash_bucket_size=1000)
        crossed_feature2 = feature_column.indicator_column(crossed_feature2)
        feature_columns.append(crossed_feature2)

        crossed_feature3 = feature_column.crossed_column(
            [participant3_action, participant3_focus], hash_bucket_size=1000)
        crossed_feature3 = feature_column.indicator_column(crossed_feature3)
        feature_columns.append(crossed_feature3)

        self.feature_columns = feature_columns
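
A minimal sketch of how the stored columns might be consumed afterwards, e.g. from another method of the same class (hypothetical usage, not part of the original):

feature_layer = tf.keras.layers.DenseFeatures(self.feature_columns)
model = tf.keras.Sequential([
    feature_layer,
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])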
    print("Every feature:", list(feature_batch.keys()))
    print("A batch of ages", feature_batch["Age"])
    print("A batch of targets", label_batch)

example_batch = next(iter(train_ds))[0]  # one batch of data


def demo(feature_column):  # display the transformed batch
    feature_layer = layers.DenseFeatures(feature_columns=feature_column)
    print(feature_layer(example_batch).numpy())


feature_columns = []
# ---------------------------------------numeric columns--------------------------------------------
for header in ["PhotoAmt", "Fee", "Age"]:
    feature_columns.append(feature_column.numeric_column(header))
# test
# photo_count = feature_column.numeric_column('PhotoAmt')
# demo(photo_count)

# --------------------------------------bucketized columns--------------------------------------------------
age = feature_column.numeric_column(key="Age")
age_buckets = feature_column.bucketized_column(source_column=age,
                                               boundaries=[1, 2, 3, 4, 5])
# test
# demo(age_buckets)
feature_columns.append(age_buckets)

# --------------------------------------categorical columns--------------------------------------------------
animal_type = feature_column.categorical_column_with_vocabulary_list(
    key="Type", vocabulary_list=["Cat", "Dog"])
animal_type_one_hot = feature_column.indicator_column(animal_type)
feature_columns.append(animal_type_one_hot)
Example #10
        if options['distribute']:
            return dataset
        else:
            return dataset.make_one_shot_iterator().get_next()

    return _input_fn


#
# The input layer: See Feature_Engineering.ipynb for explanations
#
from tensorflow.feature_column import numeric_column
from tensorflow.feature_column import crossed_column
from tensorflow.feature_column import indicator_column
from tensorflow.feature_column import categorical_column_with_identity
from tensorflow_transform.tf_metadata import dataset_schema

beta1 = numeric_column('beta1')
beta2 = numeric_column('beta2')

weekday = categorical_column_with_identity('weekday', num_buckets=7)
hour = categorical_column_with_identity('hour', num_buckets=24)
hour_of_week = indicator_column(crossed_column([weekday, hour], 24 * 7))

all_feature_columns = [beta1, beta2, hour_of_week]


def input_layer(features):
    return tf.feature_column.input_layer(features,
                                         feature_columns=all_feature_columns)
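
A minimal sketch (hypothetical, not from the original) of how input_layer could feed a TF1-style estimator model_fn:

def model_fn(features, labels, mode):
    net = input_layer(features)
    logits = tf.compat.v1.layers.dense(net, units=1)
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(
            mode, predictions={'prob': tf.sigmoid(logits)})
    loss = tf.compat.v1.losses.sigmoid_cross_entropy(
        tf.reshape(labels, [-1, 1]), logits)
    train_op = tf.compat.v1.train.AdamOptimizer().minimize(
        loss, global_step=tf.compat.v1.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)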
Example #11
# ax.scatter3D(data_train["Temp"], data_train["QDot"], data_train["HTC"], c='k', marker='x', label="Train")

# ax.set_xlabel("Temp")
# ax.set_ylabel("QDot")
# ax.set_zlabel("HTC")
# plt.legend()
#plt.show()

data_train.head()

#%%
# Feature Columns: As for any other TF estimator, data needs to be passed to the estimator, 
#      which is typically via an input_fn and parsed using FeatureColumns.

feature_columns = [
    fc.numeric_column("Temp"),
    fc.numeric_column("QDot")
]

# creating input_fn: As for any other estimator, you can use an input_fn to feed data to the 
#     model for training and evaluation. TFL estimators automatically calculate quantiles of
#     the features and use them as input keypoints for the PWL calibration layer. To do so, 
#     they require passing a feature_analysis_input_fn, which is similar to the training 
#     input_fn but with a single epoch or a subsample of the data.
train_input_fn = tf.compat.v1.estimator.inputs.pandas_input_fn(
    x=data_train[data_train.columns[:2]],
    y=data_train["HTC"],
    shuffle=False,
    batch_size=BATCH_SIZE,
    num_epochs=NUM_EPOCHS,
    num_threads=1)
Example #12
#demo(feature_column.indicator_column(crossed_feature))

feature_columns = []

#Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
#feature_columns.append(gender_one_hot)

#feature_columns.append(age_buckets)
#feature_columns.append(age)
# numeric cols

for header in numeric_columns:
    print('Printing header')
    print(header)
    feature_columns.append(
        feature_column.numeric_column(header, dtype=tf.float64))

# bucketized cols

age = feature_column.numeric_column('Age')
age_buckets = feature_column.bucketized_column(
    age,
    boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90])

gender = feature_column.categorical_column_with_vocabulary_list(
    'Gender', ['Male', 'Female'])

gender_one_hot = feature_column.indicator_column(gender)

feature_columns.append(age_buckets)
feature_columns.append(gender_one_hot)
Example #13
train_ds = df_to_dataset(train, shuffle=True, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=True, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=True, batch_size=batch_size)

for feature_batch, label_batch in train_ds.take(1):
    print("dnesity feature", feature_batch['density'])

example_batch = next(iter(train_ds))[0]


def demo(feature_column):
    feature_layer = layers.DenseFeatures(feature_column)
    print(feature_layer(example_batch).numpy())


dioxide = feature_column.numeric_column('free_sulfur_dioxide')
demo(dioxide)

dioxide_buckets = feature_column.bucketized_column(
    dioxide, boundaries=[2, 6, 10, 14, 18, 24, 30, 50])

demo(dioxide_buckets)

#    fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  alcohol  quality
#0             7.4             0.700         0.00             1.9      0.076                 11.0                  34.0   0.9978  3.51       0.56      9.4        5

feature_columns = []
for header in [
        'fixed_acidity', 'volatile_acidity', 'residual_sugar', 'chlorides',
        'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density', 'pH',
        'sulphates', 'alcohol'
]:
    feature_columns.append(feature_column.numeric_column(header))
Example #14
    print('A batch of targets: ', label_batch)

# We will use this batch to demonstrate several types of feature columns
example_batch = next(iter(train_ds))[0]


# A utility method to create a feature column
# and to transform a batch of data
def demo(feature_column):
    feature_layer = layers.DenseFeatures(feature_column)
    print(feature_layer(example_batch).numpy())

feature_columns = []

# We create different columns to see the results for each feature
age = feature_column.numeric_column("edad")
demo(age)

age_buckets = feature_column.bucketized_column(age, boundaries=[16, 25, 30, 35, 40, 45, 50])
demo(age_buckets)
feature_columns.append(age_buckets)

genero = feature_column.categorical_column_with_vocabulary_list(
    'genero', ['Hombre', 'Mujer'])

genero_one_hot = feature_column.indicator_column(genero)
demo(genero_one_hot)
feature_columns.append(genero_one_hot)

ubicacion = feature_column.categorical_column_with_vocabulary_list(
    'ubicacion', ['Pueblo', 'Ciudad'])
ubicacion_one_hot = feature_column.indicator_column(ubicacion)
demo(ubicacion_one_hot)
feature_columns.append(ubicacion_one_hot)
Example #15
def get_compiled_model(headers, targetGroup, denseNum):
  feature_columns = []
  for header in headers:
    feature_columns.append(feature_column.numeric_column(header))
  # Built for dict-style tf.data inputs; the Sequential models below instead
  # take plain feature arrays of width len(headers), so this layer is unused.
  feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

  if len(targetGroup) > 2:
    # Multi-class classification: one softmax unit per target class.
    denseCount = len(targetGroup)
    model = keras.Sequential([
      keras.layers.Dense(64, activation='relu', input_dim=len(headers)),
      keras.layers.Dense(64, activation='relu'),
      keras.layers.AlphaDropout(rate=0.5),
      keras.layers.Dense(denseCount, activation='softmax')
    ])
    # The final layer already applies softmax, so the loss consumes
    # probabilities and from_logits must be left False.
    model.compile(optimizer='adam',
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                  metrics=['accuracy'])
  else:
    # Binary classification.
    model = keras.Sequential([
      keras.layers.Dense(64, activation='relu', input_dim=len(headers)),
      keras.layers.Dense(64, activation='relu'),
      keras.layers.AlphaDropout(rate=0.5),
      keras.layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'],
                  run_eagerly=True)
  return model
Example #16
print(numeric_data)

print('GENDER============>')
gender = feature_column.categorical_column_with_vocabulary_list(
    'Gender', ['Male', 'Female'])

gender_one_hot = feature_column.indicator_column(gender)
print('Gender one hot')
demo(gender_one_hot)

print('AGE==============>')
age = feature_column.numeric_column('Age')
print("Demo AGE")
demo(age)

age_buckets = feature_column.bucketized_column(
    age,
    boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90])
demo(age_buckets)

# Note: feature columns describe transformations; they are not arrays. To
# attach the bucketized age or one-hot gender to `numeric_data`, the columns
# must first be materialized through a DenseFeatures layer (as demo() does)
# rather than passed to pd.DataFrame directly.
Example #17
def main(_):
    # Parse configs updates from command line flags.
    config_updates = []
    for update in FLAGS.config_updates:
        config_updates.extend(re.findall(r'(\S*)\s*=\s*(\S*)', update))

    # UCI Statlog (Heart) dataset.
    csv_file = tf.keras.utils.get_file(
        'heart.csv',
        'http://storage.googleapis.com/download.tensorflow.org/data/heart.csv')
    df = pd.read_csv(csv_file)
    target = df.pop('target')
    train_size = int(len(df) * 0.8)
    train_x = df[:train_size]
    train_y = target[:train_size]
    test_x = df[train_size:]
    test_y = target[train_size:]

    # feature_analysis_input_fn is used to collect statistics about the input
    # features, and thus requires only one pass over the dataset.
    #
    # feature_analysis_input_fn is required if you have at least one
    # FeatureConfig with "pwl_calibration_input_keypoints='quantiles'". Note
    # that 'quantiles' is the default keypoints configuration, so you will
    # most likely need it.
    feature_analysis_input_fn = tf.compat.v1.estimator.inputs.pandas_input_fn(
        x=train_x,
        y=train_y,
        shuffle=False,
        batch_size=FLAGS.batch_size,
        num_epochs=1,
        num_threads=1)

    # prefitting_input_fn is used to prefit an initial ensemble that is used to
    # estimate feature interactions. This prefitting step does not need to
    # fully converge, and thus requires fewer epochs than the main training.
    #
    # prefitting_input_fn is only required if your model_config is
    # CalibratedLatticeEnsembleConfig with "lattices='crystals'".
    prefitting_input_fn = tf.compat.v1.estimator.inputs.pandas_input_fn(
        x=train_x,
        y=train_y,
        shuffle=True,
        batch_size=FLAGS.batch_size,
        num_epochs=FLAGS.prefitting_num_epochs,
        num_threads=1)

    train_input_fn = tf.compat.v1.estimator.inputs.pandas_input_fn(
        x=train_x,
        y=train_y,
        shuffle=True,
        batch_size=FLAGS.batch_size,
        num_epochs=FLAGS.num_epochs,
        num_threads=1)

    test_input_fn = tf.compat.v1.estimator.inputs.pandas_input_fn(
        x=test_x,
        y=test_y,
        shuffle=False,
        batch_size=FLAGS.batch_size,
        num_epochs=FLAGS.num_epochs,
        num_threads=1)

    # Feature columns.
    # - age
    # - sex
    # - cp        chest pain type (4 values)
    # - trestbps  resting blood pressure
    # - chol      serum cholestoral in mg/dl
    # - fbs       fasting blood sugar > 120 mg/dl
    # - restecg   resting electrocardiographic results (values 0,1,2)
    # - thalach   maximum heart rate achieved
    # - exang     exercise induced angina
    # - oldpeak   ST depression induced by exercise relative to rest
    # - slope     the slope of the peak exercise ST segment
    # - ca        number of major vessels (0-3) colored by fluoroscopy
    # - thal      3 = normal; 6 = fixed defect; 7 = reversible defect
    feature_columns = [
        fc.numeric_column('age', default_value=-1),
        fc.categorical_column_with_vocabulary_list('sex', [0, 1]),
        fc.numeric_column('cp'),
        fc.numeric_column('trestbps', default_value=-1),
        fc.numeric_column('chol'),
        fc.categorical_column_with_vocabulary_list('fbs', [0, 1]),
        fc.categorical_column_with_vocabulary_list('restecg', [0, 1, 2]),
        fc.numeric_column('thalach'),
        fc.categorical_column_with_vocabulary_list('exang', [0, 1]),
        fc.numeric_column('oldpeak'),
        fc.categorical_column_with_vocabulary_list('slope', [0, 1, 2]),
        fc.numeric_column('ca'),
        fc.categorical_column_with_vocabulary_list(
            'thal', ['normal', 'fixed', 'reversible']),
    ]

    # Feature configs are used to specify how each feature is calibrated and used.
    feature_configs = [
        configs.FeatureConfig(
            name='age',
            lattice_size=3,
            # By default, input keypoints of pwl are quantiles of the feature.
            pwl_calibration_num_keypoints=5,
            monotonicity='increasing',
            pwl_calibration_clip_max=100,
        ),
        configs.FeatureConfig(
            name='cp',
            pwl_calibration_num_keypoints=4,
            # Keypoints can be uniformly spaced.
            pwl_calibration_input_keypoints='uniform',
            monotonicity='increasing',
        ),
        configs.FeatureConfig(
            name='chol',
            # Explicit input keypoint initialization.
            pwl_calibration_input_keypoints=[
                126.0, 210.0, 247.0, 286.0, 564.0
            ],
            monotonicity='increasing',
            pwl_calibration_clip_min=130,
            # Calibration can be forced to span the full output range by clamping.
            pwl_calibration_clamp_min=True,
            pwl_calibration_clamp_max=True,
            # Per feature regularization.
            regularizer_configs=[
                configs.RegularizerConfig(name='calib_hessian', l2=1e-4),
            ],
        ),
        configs.FeatureConfig(
            name='fbs',
            # Monotonicity: output for 1 should be larger than output for 0.
            monotonicity=[(0, 1)],
        ),
        configs.FeatureConfig(
            name='trestbps',
            pwl_calibration_num_keypoints=5,
            monotonicity='decreasing',
        ),
        configs.FeatureConfig(
            name='thalach',
            pwl_calibration_num_keypoints=5,
            monotonicity='decreasing',
        ),
        configs.FeatureConfig(
            name='restecg',
            # Categorical monotonicity can be partial order.
            monotonicity=[(0, 1), (0, 2)],
        ),
        configs.FeatureConfig(
            name='exang',
            monotonicity=[(0, 1)],
        ),
        configs.FeatureConfig(
            name='oldpeak',
            pwl_calibration_num_keypoints=5,
            monotonicity='increasing',
        ),
        configs.FeatureConfig(
            name='slope',
            monotonicity=[(0, 1), (1, 2)],
        ),
        configs.FeatureConfig(
            name='ca',
            pwl_calibration_num_keypoints=4,
            monotonicity='increasing',
        ),
        configs.FeatureConfig(
            name='thal',
            monotonicity=[('normal', 'fixed'), ('normal', 'reversible')],
        ),
    ]

    # Serving input fn is used to create saved models.
    serving_input_fn = (
        tf.estimator.export.build_parsing_serving_input_receiver_fn(
            feature_spec=fc.make_parse_example_spec(feature_columns)))

    # Model config defines the model structure for the estimator.
    # This is a calibrated linear model with output calibration: inputs are
    # calibrated, linearly combined, and the output of the linear layer is
    # calibrated again using a PWL function.
    model_config = configs.CalibratedLinearConfig(
        feature_configs=feature_configs,
        use_bias=True,
        output_calibration=True,
        regularizer_configs=[
            # Regularizer for the output calibrator.
            configs.RegularizerConfig(name='output_calib_hessian', l2=1e-4),
        ])
    # Update model configuration.
    # See tfl.configs.apply_updates for details.
    configs.apply_updates(model_config, config_updates)
    estimator = estimators.CannedClassifier(
        feature_columns=feature_columns,
        model_config=model_config,
        feature_analysis_input_fn=feature_analysis_input_fn,
        optimizer=tf.keras.optimizers.Adam(FLAGS.learning_rate))
    estimator.train(input_fn=train_input_fn)
    results = estimator.evaluate(input_fn=test_input_fn)
    print('Calibrated linear results: {}'.format(results))
    print('Calibrated linear model exported to {}'.format(
        estimator.export_saved_model(estimator.model_dir, serving_input_fn)))

    # This is calibrated lattice model: Inputs are calibrated, then combined
    # non-linearly using a lattice layer.
    model_config = configs.CalibratedLatticeConfig(
        feature_configs=feature_configs,
        regularizer_configs=[
            # Torsion regularizer applied to the lattice to make it more linear.
            configs.RegularizerConfig(name='torsion', l2=1e-4),
            # Globally defined calibration regularizer is applied to all features.
            configs.RegularizerConfig(name='calib_hessian', l2=1e-4),
        ])
    estimator = estimators.CannedClassifier(
        feature_columns=feature_columns,
        model_config=model_config,
        feature_analysis_input_fn=feature_analysis_input_fn,
        optimizer=tf.keras.optimizers.Adam(FLAGS.learning_rate))
    estimator.train(input_fn=train_input_fn)
    results = estimator.evaluate(input_fn=test_input_fn)
    print('Calibrated lattice results: {}'.format(results))
    print('Calibrated lattice model exported to {}'.format(
        estimator.export_saved_model(estimator.model_dir, serving_input_fn)))

    # This is random lattice ensemble model with separate calibration:
    # model output is the average output of separately calibrated lattices.
    model_config = configs.CalibratedLatticeEnsembleConfig(
        feature_configs=feature_configs,
        num_lattices=6,
        lattice_rank=5,
        separate_calibrators=True,
        regularizer_configs=[
            # Torsion regularizer applied to the lattice to make it more linear.
            configs.RegularizerConfig(name='torsion', l2=1e-4),
            # Globally defined calibration regularizer is applied to all features.
            configs.RegularizerConfig(name='calib_hessian', l2=1e-4),
        ])
    configs.apply_updates(model_config, config_updates)
    estimator = estimators.CannedClassifier(
        feature_columns=feature_columns,
        model_config=model_config,
        feature_analysis_input_fn=feature_analysis_input_fn,
        optimizer=tf.keras.optimizers.Adam(FLAGS.learning_rate))
    estimator.train(input_fn=train_input_fn)
    results = estimator.evaluate(input_fn=test_input_fn)
    print('Random ensemble results: {}'.format(results))
    print('Random ensemble model exported to {}'.format(
        estimator.export_saved_model(estimator.model_dir, serving_input_fn)))

    # This is Crystals ensemble model with separate calibration: model output is
    # the average output of separately calibrated lattices.
    # Crystals algorithm first trains a prefitting model and uses the interactions
    # between features to form the final lattice ensemble.
    model_config = configs.CalibratedLatticeEnsembleConfig(
        feature_configs=feature_configs,
        # Using Crystals algorithm.
        lattices='crystals',
        num_lattices=6,
        lattice_rank=5,
        separate_calibrators=True,
        regularizer_configs=[
            # Torsion regularizer applied to the lattice to make it more linear.
            configs.RegularizerConfig(name='torsion', l2=1e-4),
            # Globally defined calibration regularizer is applied to all features.
            configs.RegularizerConfig(name='calib_hessian', l2=1e-4),
        ])
    configs.apply_updates(model_config, config_updates)
    estimator = estimators.CannedClassifier(
        feature_columns=feature_columns,
        model_config=model_config,
        feature_analysis_input_fn=feature_analysis_input_fn,
        # prefitting_input_fn is required to train the prefitting model.
        prefitting_input_fn=prefitting_input_fn,
        optimizer=tf.keras.optimizers.Adam(FLAGS.learning_rate))
    estimator.train(input_fn=train_input_fn)
    results = estimator.evaluate(input_fn=test_input_fn)
    print('Crystals ensemble results: {}'.format(results))
    print('Crystals ensemble model exported to {}'.format(
        estimator.export_saved_model(estimator.model_dir, serving_input_fn)))
Example #18
    print('A batch of targets:', label_batch)

# We will use this batch to demonstrate several types of feature columns
example_batch = next(iter(train_ds))[0]


# A utility method to create a feature column
# and to transform a batch of data
def demo(feature_column):
    feature_layer = layers.DenseFeatures(feature_column)
    print(feature_layer(example_batch).numpy())


#Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset

age = feature_column.numeric_column('Age')
print("Demo AGE")
demo(age)

age_buckets = feature_column.bucketized_column(
    age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65, 75, 85, 90])
print('Demo AGE buckets')
demo(age_buckets)

gender = feature_column.categorical_column_with_vocabulary_list(
    'Gender', ['Male', 'Female'])

gender_one_hot = feature_column.indicator_column(gender)
print('Gender one hot')
demo(gender_one_hot)
Example #19
#for feature_batch, label_batch in train_ds.take(1):
  #print('Every feature:', list(feature_batch.keys()))
  #print('A batch of YTG:', feature_batch['YardsToGo'])
  #print('A batch of targets:', label_batch)
  #print(tf.shape(label_batch))

example_batch = next(iter(train_ds))[0]

def demo(feature_column):
  feature_layer = layers.DenseFeatures(feature_column)
  print(feature_layer(example_batch).numpy())

feature_columns = []
for header in ['Down', 'ScoreDifferential', 'CurrentQuarter', 'OffenseSPRank', 'RushRank', 'PassRank', 'OpponentPlay', 'YardsToGo', 'YardsToGoal', 'Location']:
  feature_columns.append(feature_column.numeric_column(header))

Down = feature_column.numeric_column("Down")
Quarter = feature_column.numeric_column("CurrentQuarter")
Score = feature_column.numeric_column("ScoreDifferential")
SP = feature_column.numeric_column("OffenseSPRank")
RushR = feature_column.numeric_column("RushRank")
PassR = feature_column.numeric_column("PassRank")
YTGo = feature_column.numeric_column("YardsToGo")
YTGoal = feature_column.numeric_column("YardsToGoal")
Play = feature_column.categorical_column_with_vocabulary_list('OpponentPlay', ['Pass', 'Run'])
Location = feature_column.categorical_column_with_vocabulary_list('Location', ['Home', 'Neutral', 'Away'])

Down_buckets = feature_column.bucketized_column(Down, boundaries=[4])
feature_columns.append(Down_buckets)
Quarter_buckets = feature_column.bucketized_column(Quarter, boundaries=[2, 3, 4])
Example #20
def create_feature_columns():
    c2id = fc.categorical_column_with_hash_bucket("cate2Id",
                                                  5000,
                                                  dtype=tf.int64)
    modified_time = fc.numeric_column("modified_time", default_value=0.0)
    modified_time_sqrt = fc.numeric_column("modified_time_sqrt",
                                           default_value=0.0)
    modified_time_square = fc.numeric_column("modified_time_square",
                                             default_value=0.0)
    props_sex = fc.indicator_column(
        fc.categorical_column_with_vocabulary_list(
            "props_sex",
            ["男", "女", "通用", "情侣"],  # male / female / unisex / couples
            default_value=0))
    brand_grade = fc.indicator_column(
        fc.categorical_column_with_vocabulary_list(
            "brand_grade",
            ["A类品牌", "B类品牌", "C类品牌", "D类品牌"],  # brand tiers A-D
            default_value=0))
    shipment_rate = fc.numeric_column("shipment_rate", default_value=0.0)
    shipping_rate = fc.numeric_column("shipping_rate", default_value=0.0)
    ipv_ntile = fc.bucketized_column(
        fc.numeric_column("ipv_ntile", dtype=tf.int64, default_value=99),
        boundaries=[1, 2, 3, 4, 5, 10, 20, 50, 80])
    pay_ntile = fc.bucketized_column(
        fc.numeric_column("pay_ntile", dtype=tf.int64, default_value=99),
        boundaries=[1, 2, 3, 4, 5, 10, 20, 50, 80])
    price = fc.numeric_column("price_norm", default_value=0.0)
    ctr_1d = fc.numeric_column("ctr_1d", default_value=0.0)
    cvr_1d = fc.numeric_column("cvr_1d", default_value=0.0)
    uv_cvr_1d = fc.numeric_column("uv_cvr_1d", default_value=0.0)
    ctr_1w = fc.numeric_column("ctr_1w", default_value=0.0)
    cvr_1w = fc.numeric_column("cvr_1w", default_value=0.0)
    uv_cvr_1w = fc.numeric_column("uv_cvr_1w", default_value=0.0)
    ctr_2w = fc.numeric_column("ctr_2w", default_value=0.0)
    cvr_2w = fc.numeric_column("cvr_2w", default_value=0.0)
    uv_cvr_2w = fc.numeric_column("uv_cvr_2w", default_value=0.0)
    ctr_1m = fc.numeric_column("ctr_1m", default_value=0.0)
    cvr_1m = fc.numeric_column("cvr_1m", default_value=0.0)
    uv_cvr_1m = fc.numeric_column("uv_cvr_1m", default_value=0.0)
    pay_qty_1d = fc.numeric_column("pay_qty_1d", default_value=0.0)
    pay_qty_1w = fc.numeric_column("pay_qty_1w", default_value=0.0)
    pay_qty_2w = fc.numeric_column("pay_qty_2w", default_value=0.0)
    pay_qty_1m = fc.numeric_column("pay_qty_1m", default_value=0.0)
    cat2_pay_qty = fc.numeric_column("cat2_pay_qty_1d", default_value=0.0)
    cat1_pay_qty = fc.numeric_column("cat1_pay_qty_1d", default_value=0.0)
    brd_pay_qty = fc.numeric_column("brd_pay_qty_1d", default_value=0.0)
    slr_pay_qty_1d = fc.numeric_column("slr_pay_qty_1d", default_value=0.0)
    slr_pay_qty_1w = fc.numeric_column("slr_pay_qty_1w", default_value=0.0)
    slr_pay_qty_2w = fc.numeric_column("slr_pay_qty_2w", default_value=0.0)
    slr_pay_qty_1m = fc.numeric_column("slr_pay_qty_1m", default_value=0.0)
    slr_brd_pay_qty_1d = fc.numeric_column("slr_brd_pay_qty_1d",
                                           default_value=0.0)
    slr_brd_pay_qty_1w = fc.numeric_column("slr_brd_pay_qty_1w",
                                           default_value=0.0)
    slr_brd_pay_qty_2w = fc.numeric_column("slr_brd_pay_qty_2w",
                                           default_value=0.0)
    slr_brd_pay_qty_1m = fc.numeric_column("slr_brd_pay_qty_1m",
                                           default_value=0.0)
    weighted_ipv = fc.numeric_column("weighted_ipv", default_value=0.0)
    cat1_weighted_ipv = fc.numeric_column("cat1_weighted_ipv",
                                          default_value=0.0)
    cate_weighted_ipv = fc.numeric_column("cate_weighted_ipv",
                                          default_value=0.0)
    slr_weighted_ipv = fc.numeric_column("slr_weighted_ipv", default_value=0.0)
    brd_weighted_ipv = fc.numeric_column("brd_weighted_ipv", default_value=0.0)
    cms_scale = fc.numeric_column("cms_scale", default_value=0.0)
    cms_scale_sqrt = fc.numeric_column("cms_scale_sqrt", default_value=0.0)

    # context feature
    matchScore = fc.numeric_column("matchScore", default_value=0.0)
    popScore = fc.numeric_column("popScore", default_value=0.0)
    brandPrefer = fc.numeric_column("brandPrefer", default_value=0.0)
    cate2Prefer = fc.numeric_column("cate2Prefer", default_value=0.0)
    catePrefer = fc.numeric_column("catePrefer", default_value=0.0)
    sellerPrefer = fc.numeric_column("sellerPrefer", default_value=0.0)
    matchType = fc.indicator_column(
        fc.categorical_column_with_identity("matchType", 9, default_value=0))
    position = fc.bucketized_column(fc.numeric_column("position",
                                                      dtype=tf.int64,
                                                      default_value=301),
                                    boundaries=[
                                        1, 2, 3, 4, 5, 6, 7, 8, 9, 15, 20, 30,
                                        40, 50, 80, 100, 150, 200, 300
                                    ])
    triggerNum = fc.indicator_column(
        fc.categorical_column_with_identity("triggerNum", 41,
                                            default_value=40))
    triggerRank = fc.indicator_column(
        fc.categorical_column_with_identity("triggerRank",
                                            41,
                                            default_value=40))
    sceneType = fc.indicator_column(
        fc.categorical_column_with_identity("type", 2, default_value=0))
    hour = fc.indicator_column(
        fc.categorical_column_with_identity("hour", 24, default_value=0))
    phoneBrandId = fc.categorical_column_with_hash_bucket("phoneBrand", 1000)
    phoneBrand = fc.shared_embedding_columns([phoneBrandId], 20)
    phoneResolutionId = fc.categorical_column_with_hash_bucket(
        "phoneResolution", 500)
    phoneResolution = fc.shared_embedding_columns([phoneResolutionId], 10)
    phoneOs = fc.indicator_column(
        fc.categorical_column_with_vocabulary_list("phoneOs",
                                                   ["android", "ios"],
                                                   default_value=0))
    tab = fc.indicator_column(
        fc.categorical_column_with_vocabulary_list("tab", [
            "ALL", "TongZhuang", "XieBao", "MuYing", "NvZhuang", "MeiZhuang",
            "JuJia", "MeiShi"
        ],
                                                   default_value=0))

    c2id_embed = fc.shared_embedding_columns(
        [c2id], 16, shared_embedding_collection_name="c2id")
    feature_columns = [
        matchScore, matchType, position, triggerNum, triggerRank, sceneType,
        hour, phoneOs, tab, popScore, sellerPrefer, brandPrefer, cate2Prefer,
        catePrefer, price, props_sex, brand_grade, modified_time,
        modified_time_sqrt, modified_time_square, shipment_rate, shipping_rate,
        ipv_ntile, pay_ntile, uv_cvr_1d, uv_cvr_1w, uv_cvr_2w, uv_cvr_1m,
        ctr_1d, ctr_1w, ctr_2w, ctr_1m, cvr_1d, cvr_1w, cvr_2w, cvr_1m,
        pay_qty_1d, pay_qty_1w, pay_qty_2w, pay_qty_1m, cat2_pay_qty,
        cat1_pay_qty, brd_pay_qty, slr_pay_qty_1d, slr_pay_qty_1w,
        slr_pay_qty_2w, slr_pay_qty_1m, slr_brd_pay_qty_1d, slr_brd_pay_qty_1w,
        slr_brd_pay_qty_2w, slr_brd_pay_qty_1m, weighted_ipv,
        cat1_weighted_ipv, cate_weighted_ipv, slr_weighted_ipv,
        brd_weighted_ipv, cms_scale, cms_scale_sqrt
    ]
    feature_columns += c2id_embed
    feature_columns += phoneResolution
    feature_columns += phoneBrand
    return feature_columns
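One detail worth noting in the example above: fc.shared_embedding_columns returns a list (one embedding column per input column), which is why the code extends feature_columns with += rather than append. A minimal illustration, reusing the c2id column from the snippet:

# shared_embedding_columns yields a list even for a single input column.
c2id_embed = fc.shared_embedding_columns(
    [c2id], 16, shared_embedding_collection_name="c2id")
assert isinstance(c2id_embed, list) and len(c2id_embed) == 1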
Example #21
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import feature_column as fc
from tensorflow.python.lib.io import file_io
from tensorflow.python.ops import init_ops
from .eval_metrics import AverageNClass, HitAtOne

N_CLASS = 3862
BATCH_SIZE = 1024
VOCAB_FILE = "data/vocabulary.csv"
# Exclude the audio features since we didn't implement audio feature extraction.
# Even if the model could be trained on audio features,
# they wouldn't be available for inference on new videos.
FEAT_COL_VIDEO = [
    fc.numeric_column(key="mean_rgb", shape=(1024, ), dtype=tf.float32),
    #fc.numeric_column(key="mean_audio", shape=(128,), dtype=tf.float32),
    fc.indicator_column(
        fc.categorical_column_with_identity(key="labels", num_buckets=N_CLASS))
]
FEAT_X = ["mean_rgb"]
FEAT_SPEC_VIDEO = fc.make_parse_example_spec(FEAT_COL_VIDEO)
MULTI_HOT_ENCODER = tf.keras.layers.DenseFeatures(FEAT_COL_VIDEO[-1])
# To use a custom serving input function, we need the estimator API;
# there is no documentation on how a Keras model can use one.
KERAS_TO_ESTIMATOR = True


def calc_class_weight(infile, scale=1):
    """Calculate class weight to re-balance label distribution.
    The class weight for class i (w_i) is determined by:
Example #22
def _sparse_fc(sparse_feature: SparseFeature):
    return fc.numeric_column(key=sparse_feature.feature_name,
                             dtype=tf.int64)
Example #23
    def create_feature_columns(self):
        _NUMERIC_COLUMNS = ['posted_time', 'owner_influence', 'poster_influence', 'participant1_influence',
                            'participant2_influence', 'participant3_influence', 'participant4_influence',
                            'participant5_influence']

        _BINARY_COLUMNS = ["is_mentions_me", "is_mentions_connections", "is_commented_by_me",
                           "is_commented_by_connections", "is_liked_by_me", "is_liked_by_connections",
                           "poster_is_employee","poster_is_in_connections",
                           "participant1_is_employee", "participant1_is_in_connections",
                           "participant2_is_employee", "participant2_is_in_connections",
                           "participant3_is_employee", "participant3_is_in_connections",
                           "participant4_is_employee", "participant4_is_in_connections",
                           "participant5_is_employee", "participant5_is_in_connections"]

        _GENDER_COLUMNS = ["poster_gender", "participant1_gender", "participant2_gender", "participant3_gender",
                           "participant4_gender", "participant5_gender"]

        self.real = {
            colname: feature_column.numeric_column(colname) \
            for colname in _NUMERIC_COLUMNS
        }

        self.sparse = dict()

        app_type = feature_column.categorical_column_with_vocabulary_list(
            'app_type', self.df.app_type.unique())
        app_type_1hot = feature_column.indicator_column(app_type)
        self.sparse["app_type"] = app_type_1hot

        owner_type = feature_column.categorical_column_with_vocabulary_list(
            'owner_type', self.df.owner_type.unique())
        owner_type_1hot = feature_column.indicator_column(owner_type)
        self.sparse["owner_type"] = owner_type_1hot

        poster_focus = feature_column.categorical_column_with_vocabulary_list(
            'poster_focus', ['engineering', 'sales', 'marketing', 'management', 'financial', 'other'])
        poster_focus_1hot = feature_column.indicator_column(poster_focus)
        self.sparse["poster_focus"] = poster_focus_1hot

        for col in _GENDER_COLUMNS:
            feature = feature_column.categorical_column_with_vocabulary_list(col, self.df[col].unique())
            feature_1hot = feature_column.indicator_column(feature)
            self.sparse[col] = feature_1hot

        participant1_action = participant_action("participant1_action")
        participant2_action = participant_action("participant2_action")
        participant3_action = participant_action("participant3_action")
        participant4_action = participant_action("participant4_action")
        participant5_action = participant_action("participant5_action")

        participant1_focus = participant_focus("participant1_focus")
        participant2_focus = participant_focus("participant2_focus")
        participant3_focus = participant_focus("participant3_focus")
        participant4_focus = participant_focus("participant4_focus")
        participant5_focus = participant_focus("participant5_focus")

        self.sparse["participant2_action"] = feature_column.indicator_column(participant2_action)
        self.sparse["participant3_action"] = feature_column.indicator_column(participant3_action)
        self.sparse["participant1_action"] = feature_column.indicator_column(participant1_action)
        self.sparse["participant4_action"] = feature_column.indicator_column(participant4_action)
        self.sparse["participant5_action"] = feature_column.indicator_column(participant5_action)

        self.sparse["participant1_focus"] = feature_column.indicator_column(participant1_focus)
        self.sparse["participant2_focus"] = feature_column.indicator_column(participant2_focus)
        self.sparse["participant3_focus"] = feature_column.indicator_column(participant3_focus)
        self.sparse["participant4_focus"] = feature_column.indicator_column(participant4_focus)
        self.sparse["participant5_focus"] = feature_column.indicator_column(participant5_focus)

        self.inputs = {
            colname: tf.keras.layers.Input(name=colname, shape=(), dtype='float32') \
            for colname in self.real.keys()
        }

        self.inputs.update({
            colname: tf.keras.layers.Input(name=colname, shape=(), dtype='string') \
            for colname in self.sparse.keys()
        })

        for col in _BINARY_COLUMNS:
            feature = feature_column.categorical_column_with_vocabulary_list(
                col, self.df[col].unique())
            feature_1hot = feature_column.indicator_column(feature)
            self.sparse[col] = feature_1hot

        likes_num = feature_column.numeric_column("number_of_likes")
        likes_num_buckets = feature_column.bucketized_column(likes_num, boundaries=[2, 5, 10, 20, 50, 100])
        self.sparse["number_of_likes"] = likes_num_buckets

        comments_num = feature_column.numeric_column("number_of_comments")
        comments_num_buckets = feature_column.bucketized_column(comments_num, boundaries=[1, 2, 5, 10, 20, 50, 100])
        self.sparse["number_of_comments"] = comments_num_buckets

        age_boundaries = [30 * _ONE_MIN, _ONE_HOUR, 2 * _ONE_HOUR, 3 * _ONE_HOUR, 4 * _ONE_HOUR, 24 * _ONE_HOUR]

        age = feature_column.numeric_column("effective_age_long")
        age_buckets = feature_column.bucketized_column(age, boundaries=age_boundaries)
        self.sparse["effective_age_long"] = age_buckets

        daytime = feature_column.numeric_column("daytime")
        daytime_buckets = feature_column.bucketized_column(daytime, boundaries=[8, 12, 16, 24])
        self.sparse["daytime"] = daytime_buckets

        weekday = feature_column.categorical_column_with_vocabulary_list(
            'weekday', self.df.weekday.unique())
        weekday_1hot = feature_column.indicator_column(weekday)
        self.sparse["weekday"] = weekday_1hot

        self.inputs.update({
            colname: tf.keras.layers.Input(name=colname, shape=(), dtype='int64') \
            for colname in
            _BINARY_COLUMNS + ["number_of_likes", "number_of_comments", "effective_age_long", "daytime", "weekday"]
        })

        # hash_bucket_size=30: weekday has 7 values and daytime effectively 4
        # buckets (hours 0-23 against boundaries 8, 12, 16, 24), so there are
        # about 28 combinations, rounded up to 30.
        weekday_x_daytime = feature_column.crossed_column([weekday, daytime_buckets], hash_bucket_size=30)
        self.sparse["weekday_x_daytime"] = feature_column.indicator_column(weekday_x_daytime)

        # likes has 7 buckets and comments 8 (one more than the boundary
        # counts), so up to 56 combinations; hash_bucket_size=45 accepts
        # some collisions.
        likes_x_comments = feature_column.crossed_column([likes_num_buckets, comments_num_buckets], hash_bucket_size=45)
        self.sparse["likes_x_comments"] = feature_column.indicator_column(likes_x_comments)

        # likes has 7 buckets, action 3 values and focus 7, so 7*3*7 = 147;
        # hash_bucket_size=130 accepts some collisions.
        likes_x_participant1_focus_n_action = feature_column.crossed_column(
            [likes_num_buckets, participant1_action, participant1_focus], hash_bucket_size=130)
        self.sparse["likes_x_participant1_focus_n_action"] = feature_column.indicator_column(
            likes_x_participant1_focus_n_action)

        likes_x_participant2_focus_n_action = feature_column.crossed_column(
            [likes_num_buckets, participant2_action, participant2_focus], hash_bucket_size=130)
        self.sparse["likes_x_participant2_focus_n_action"] = feature_column.indicator_column(
            likes_x_participant2_focus_n_action)

        likes_x_participant3_focus_n_action = feature_column.crossed_column(
            [likes_num_buckets, participant3_action, participant3_focus], hash_bucket_size=130)
        self.sparse["likes_x_participant3_focus_n_action"] = feature_column.indicator_column(
            likes_x_participant3_focus_n_action)

        likes_x_participant4_focus_n_action = feature_column.crossed_column(
            [likes_num_buckets, participant4_action, participant4_focus], hash_bucket_size=130)
        self.sparse["likes_x_participant4_focus_n_action"] = feature_column.indicator_column(
            likes_x_participant4_focus_n_action)

        likes_x_participant5_focus_n_action = feature_column.crossed_column(
            [likes_num_buckets, participant5_action, participant5_focus], hash_bucket_size=130)
        self.sparse["likes_x_participant5_focus_n_action"] = feature_column.indicator_column(
            likes_x_participant5_focus_n_action)
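Example #23 calls participant_action and participant_focus without defining them; they are presumably module-level helpers. A hypothetical sketch consistent with the crossing comments above (3 action values, 7 focus values); the vocabularies are illustrative guesses, not from the original source:

# Hypothetical helpers; the vocabularies are guesses sized to match the
# hash_bucket_size comments (3 actions, 7 focus values).
def participant_action(colname):
    return feature_column.categorical_column_with_vocabulary_list(
        colname, ['liked', 'commented', 'none'])

def participant_focus(colname):
    return feature_column.categorical_column_with_vocabulary_list(
        colname, ['engineering', 'sales', 'marketing', 'management',
                  'financial', 'other', 'unknown'])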
Example #24
def _dense_fc(dense_feature: DenseFeature):
    return fc.numeric_column(key=dense_feature.feature_name)
Example #25
def create_feature_columns(df, which_lanes):
    transformed_df = df.copy()
    all_lanes = which_lanes.copy()
    champion_list = champ_info.id.tolist()
    feature_columns = []
    good_numeric = list(map(lambda x: "{}_good_numeric".format(x), all_lanes))
    for header in good_numeric:
        feature_columns.append(
            feature_column.numeric_column(header, shape=(22, )))
    bad_numeric = list(map(lambda x: "{}_bad_numeric".format(x), all_lanes))
    for header in bad_numeric:
        feature_columns.append(
            feature_column.numeric_column(header, shape=(3, )))
    good_categ = list(map(lambda x: "{}_good_categ".format(x), all_lanes))
    for header in good_categ:
        feature_columns.append(
            feature_column.numeric_column(header, shape=(4, )))
    bad_categ = list(map(lambda x: "{}_bad_categ".format(x), all_lanes))
    for header in bad_categ:
        feature_columns.append(
            feature_column.numeric_column(header, shape=(2, )))
    # The five lane crossings below are identical up to the lane name, so
    # build them in one loop: cross each lane's two champion slots and
    # one-hot the crossing.
    for lane in ['TOP', 'JUNGLE', 'MID', 'ADC', 'SUPPORT']:
        champ100 = feature_column.categorical_column_with_vocabulary_list(
            "{}100_champ".format(lane), champion_list)
        champ200 = feature_column.categorical_column_with_vocabulary_list(
            "{}200_champ".format(lane), champion_list)
        crossed = feature_column.crossed_column([champ100, champ200],
                                                hash_bucket_size=1000)
        feature_columns.append(feature_column.indicator_column(crossed))

    return feature_columns
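The numeric columns in Example #25 carry an explicit shape, meaning each row must supply a fixed-length vector rather than a scalar. A small self-contained sketch of that behaviour (the column name is borrowed from the snippet):

import tensorflow as tf
from tensorflow import feature_column
from tensorflow.keras import layers

# A shape=(22,) numeric column consumes a length-22 vector per example.
col = feature_column.numeric_column("TOP_good_numeric", shape=(22,))
batch = {"TOP_good_numeric": tf.ones([3, 22])}  # batch of 3 examples
print(layers.DenseFeatures(col)(batch).shape)   # -> (3, 22)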
Example #26
# Configure the train_inpf to iterate over the data twice:
import functools
from IPython.display import clear_output  # used by clear_output() below

train_inpf = functools.partial(census_dataset.input_fn, train_file, num_epochs=2, shuffle=True, batch_size=64)
test_inpf = functools.partial(census_dataset.input_fn, test_file, num_epochs=1, shuffle=False, batch_size=64)



## Selecting and Engineering Features for the Model
### Base Feature Columns
#### Numericals

age = fc.numeric_column('age')

# Train and evaluate a model using only the age feature:
classifier = tf.estimator.LinearClassifier(feature_columns=[age])
classifier.train(train_inpf)
result = classifier.evaluate(test_inpf)

clear_output()  # used for display in notebook
print(result)


# We define a NumericColumn for each continuous feature column that we want to use in the model:
education_num = tf.feature_column.numeric_column('education_num')
capital_gain = tf.feature_column.numeric_column('capital_gain')
capital_loss = tf.feature_column.numeric_column('capital_loss')
hours_per_week = tf.feature_column.numeric_column('hours_per_week')
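A natural continuation (a sketch, not part of the original snippet) is to retrain the linear model on all of the continuous columns just defined and compare against the age-only baseline:

# Sketch: same train/evaluate loop as above, now with every numeric column.
my_numeric_columns = [age, education_num, capital_gain, capital_loss,
                      hours_per_week]
classifier = tf.estimator.LinearClassifier(feature_columns=my_numeric_columns)
classifier.train(train_inpf)
print(classifier.evaluate(test_inpf))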
Example #27
    ds = ds.batch(batch_size)
    return ds


example_batch = next(iter(df_to_dataset(dataframe)))[0]


# A utility method to create a feature column
# and to transform a batch of data
def demo(feature_column):
    feature_layer = layers.DenseFeatures(feature_column)
    print(feature_layer(example_batch).numpy())


print('Pre processing age==========================>')
age = feature_column.numeric_column("age")
demo(age)

age_buckets = feature_column.bucketized_column(
    age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
#demo(age_buckets)
#type(age_buckets)

#dataframe.pop('age')
#age_buckets_series = pd.Series(age_buckets)
print('age buckets series====================>')
demo(age_buckets)
# A feature column is a transformation description, not data, so it cannot be
# put into a DataFrame directly; materialize the transformed batch instead.
age_buckets_df = pd.DataFrame(layers.DenseFeatures(age_buckets)(example_batch).numpy())
print('age buckets dataframe====================>')
print(age_buckets_df.head())

#dataframe.join(age_buckets_df)
Example #28
def create_feature_columns():
    # user feature
    bids = fc.categorical_column_with_hash_bucket("behaviorBids",
                                                  10240,
                                                  dtype=tf.int64)
    c1ids = fc.categorical_column_with_hash_bucket("behaviorC1ids",
                                                   100,
                                                   dtype=tf.int64)
    cids = fc.categorical_column_with_hash_bucket("behaviorCids",
                                                  10240,
                                                  dtype=tf.int64)
    sids = fc.categorical_column_with_hash_bucket("behaviorSids",
                                                  10240,
                                                  dtype=tf.int64)
    pids = fc.categorical_column_with_hash_bucket("behaviorPids",
                                                  1000000,
                                                  dtype=tf.int64)
    bids_weighted = fc.weighted_categorical_column(bids, "bidWeights")
    c1ids_weighted = fc.weighted_categorical_column(c1ids, "c1idWeights")
    cids_weighted = fc.weighted_categorical_column(cids, "cidWeights")
    sids_weighted = fc.weighted_categorical_column(sids, "sidWeights")
    pids_weighted = fc.weighted_categorical_column(pids, "pidWeights")

    # item feature
    pid = fc.categorical_column_with_hash_bucket("productId",
                                                 1000000,
                                                 dtype=tf.int64)
    sid = fc.categorical_column_with_hash_bucket("sellerId",
                                                 10240,
                                                 dtype=tf.int64)
    bid = fc.categorical_column_with_hash_bucket("brandId",
                                                 10240,
                                                 dtype=tf.int64)
    c1id = fc.categorical_column_with_hash_bucket("cate1Id",
                                                  100,
                                                  dtype=tf.int64)
    cid = fc.categorical_column_with_hash_bucket("cateId",
                                                 10240,
                                                 dtype=tf.int64)

    # context feature
    matchScore = fc.numeric_column("matchScore", default_value=0.0)
    popScore = fc.numeric_column("popScore", default_value=0.0)
    brandPrefer = fc.numeric_column("brandPrefer", default_value=0.0)
    cate2Prefer = fc.numeric_column("cate2Prefer", default_value=0.0)
    catePrefer = fc.numeric_column("catePrefer", default_value=0.0)
    sellerPrefer = fc.numeric_column("sellerPrefer", default_value=0.0)
    matchType = fc.indicator_column(
        fc.categorical_column_with_identity("matchType", 9, default_value=0))
    position = fc.indicator_column(
        fc.categorical_column_with_identity("position", 201,
                                            default_value=200))
    triggerNum = fc.indicator_column(
        fc.categorical_column_with_identity("triggerNum", 51,
                                            default_value=50))
    triggerRank = fc.indicator_column(
        fc.categorical_column_with_identity("triggerRank",
                                            51,
                                            default_value=50))
    sceneType = fc.indicator_column(
        fc.categorical_column_with_identity("type", 2, default_value=0))
    hour = fc.indicator_column(
        fc.categorical_column_with_identity("hour", 24, default_value=0))
    phoneBrand = fc.indicator_column(
        fc.categorical_column_with_hash_bucket("phoneBrand", 1000))
    phoneResolution = fc.indicator_column(
        fc.categorical_column_with_hash_bucket("phoneResolution", 500))
    phoneOs = fc.indicator_column(
        fc.categorical_column_with_vocabulary_list("phoneOs",
                                                   ["android", "ios"],
                                                   default_value=0))
    tab = fc.indicator_column(
        fc.categorical_column_with_vocabulary_list("tab", [
            "ALL", "TongZhuang", "XieBao", "MuYing", "NvZhuang", "MeiZhuang",
            "JuJia", "MeiShi"
        ],
                                                   default_value=0))

    pid_embed = fc.shared_embedding_columns(
        [pids_weighted, pid],
        64,
        combiner='sum',
        shared_embedding_collection_name="pid")
    bid_embed = fc.shared_embedding_columns(
        [bids_weighted, bid],
        32,
        combiner='sum',
        shared_embedding_collection_name="bid")
    cid_embed = fc.shared_embedding_columns(
        [cids_weighted, cid],
        32,
        combiner='sum',
        shared_embedding_collection_name="cid")
    c1id_embed = fc.shared_embedding_columns(
        [c1ids_weighted, c1id],
        10,
        combiner='sum',
        shared_embedding_collection_name="c1id")
    sid_embed = fc.shared_embedding_columns(
        [sids_weighted, sid],
        32,
        combiner='sum',
        shared_embedding_collection_name="sid")
    global my_feature_columns
    my_feature_columns = [
        matchScore, matchType, position, triggerNum, triggerRank, sceneType,
        hour, phoneBrand, phoneResolution, phoneOs, tab, popScore,
        sellerPrefer, brandPrefer, cate2Prefer, catePrefer
    ]
    my_feature_columns += pid_embed
    my_feature_columns += sid_embed
    my_feature_columns += bid_embed
    my_feature_columns += cid_embed
    my_feature_columns += c1id_embed
    print("feature columns:", my_feature_columns)
    return my_feature_columns
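Each weighted_categorical_column above pairs an id feature with a same-shaped weight feature (behaviorCids with cidWeights, and so on), and combiner='sum' turns the pair into a weighted-sum embedding. A self-contained TF1-style sketch for one such pair; the batch values are made up:

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
fc = tf.feature_column

cids = fc.categorical_column_with_hash_bucket("behaviorCids", 10240,
                                              dtype=tf.int64)
cids_weighted = fc.weighted_categorical_column(cids, "cidWeights")
cid_embed = fc.shared_embedding_columns([cids_weighted], 8, combiner='sum')

features = {
    "behaviorCids": tf.constant([[1001, 1002, 1003]], dtype=tf.int64),
    "cidWeights": tf.constant([[0.6, 0.3, 0.1]]),
}
dense = tf.feature_column.input_layer(features, cid_embed)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(dense))  # one 8-dim weighted-sum embedding for the row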
Example #29
  print('Every feature:', list(feature_batch.keys()))
  print('A batch of ages:', feature_batch['Age'])
  print('A batch of targets:', label_batch )

# We will use this batch to demonstrate several types of feature columns
example_batch = next(iter(train_ds))[0]

# A utility method to create a feature column
# and to transform a batch of data
def demo(feature_column):
  feature_layer = layers.DenseFeatures(feature_column)
  print(feature_layer(example_batch).numpy())

#Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset

age = feature_column.numeric_column('Age')
print("Demo AGE")
demo(age)

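# Note: the key passed to numeric_column identifies the feature, not the
# Python variable name, so 'ronaldo' below is just the 'Age' column again.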
ronaldo = feature_column.numeric_column('Age')
print('Demo Ronaldo')
demo(ronaldo)

age_buckets = feature_column.bucketized_column(age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65, 75, 85, 90])
print('Demo AGE buckets')
demo(age_buckets)

gender = feature_column.categorical_column_with_vocabulary_list(
    'Gender', ['Male', 'Female'])

gender_one_hot = feature_column.indicator_column(gender)
Example #30
def official_census_feature_columns_config_demo():
    # categorical_column
    gender = fc.categorical_column_with_vocabulary_list(
        'gender', ['Female', 'Male'])
    education = fc.categorical_column_with_vocabulary_list(
        'education', [
            'Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college',
            'Assoc-acdm', 'Assoc-voc', '7th-8th', 'Doctorate', 'Prof-school',
            '5th-6th', '10th', '1st-4th', 'Preschool', '12th'
        ])
    marital_status = fc.categorical_column_with_vocabulary_list(
        'marital_status', [
            'Married-civ-spouse', 'Divorced', 'Married-spouse-absent',
            'Never-married', 'Separated', 'Married-AF-spouse', 'Widowed'
        ])
    relationship = fc.categorical_column_with_vocabulary_list(
        'relationship', [
            'Husband', 'Not-in-family', 'Wife', 'Own-child', 'Unmarried',
            'Other-relative'
        ])
    workclass = fc.categorical_column_with_vocabulary_list(
        'workclass', [
            'Self-emp-not-inc', 'Private', 'State-gov', 'Federal-gov',
            'Local-gov', '?', 'Self-emp-inc', 'Without-pay', 'Never-worked'
        ])

    # To show an example of hashing:
    native_country = fc.categorical_column_with_hash_bucket(
        'native_country', hash_bucket_size=1000)
    occupation = fc.categorical_column_with_hash_bucket('occupation',
                                                        hash_bucket_size=1000)

    # Continuous feature columns.
    age = fc.numeric_column('age')
    education_num = fc.numeric_column('education_num')
    capital_gain = fc.numeric_column('capital_gain')
    capital_loss = fc.numeric_column('capital_loss')
    hours_per_week = fc.numeric_column('hours_per_week')

    # bucketized transformations.
    age_buckets = fc.bucketized_column(
        age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])

    # Wide columns and deep columns.
    base_columns = [
        gender, education, marital_status, relationship, workclass, occupation,
        native_country, age_buckets
    ]
    crossed_columns = [
        fc.crossed_column(['education', 'occupation'], hash_bucket_size=1000),
        fc.crossed_column([age_buckets, 'education', 'occupation'],
                          hash_bucket_size=1000),
        fc.crossed_column(['native_country', 'occupation'],
                          hash_bucket_size=1000)
    ]
    feature_columns = [
        fc.indicator_column(workclass),
        fc.indicator_column(education),
        fc.indicator_column(gender),
        fc.indicator_column(relationship),
        fc.embedding_column(native_country, dimension=32),
        fc.embedding_column(occupation, dimension=32),
        age,
        education_num,
        capital_gain,
        capital_loss,
        hours_per_week,
    ]
    return feature_columns, base_columns, crossed_columns
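The three lists returned here map directly onto the classic wide & deep split. A hedged usage sketch (the estimator wiring is an assumption, not part of the snippet):

# Sketch: feed the crossed/base columns to the linear half and the dense
# columns to the DNN half of a wide & deep model.
feature_columns, base_columns, crossed_columns = (
    official_census_feature_columns_config_demo())
estimator = tf.estimator.DNNLinearCombinedClassifier(
    linear_feature_columns=base_columns + crossed_columns,
    dnn_feature_columns=feature_columns,
    dnn_hidden_units=[100, 50])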