def get_wide_deep():
    # define column types
    races = ['White', 'Black', 'American Indian', 'Chinese',
             'Japanese', 'Hawaiian', 'Filipino', 'Unknown',
             'Asian Indian', 'Korean', 'Samaon', 'Vietnamese']
    is_male, mother_age, mother_race, plurality, gestation_weeks, \
        mother_married, cigarette_use, alcohol_use = [
            tflayers.sparse_column_with_keys('is_male', keys=['True', 'False']),
            tflayers.real_valued_column('mother_age'),
            tflayers.sparse_column_with_keys('mother_race', keys=races),
            tflayers.real_valued_column('plurality'),
            tflayers.real_valued_column('gestation_weeks'),
            tflayers.sparse_column_with_keys('mother_married', keys=['True', 'False']),
            tflayers.sparse_column_with_keys('cigarette_use', keys=['True', 'False', 'None']),
            tflayers.sparse_column_with_keys('alcohol_use', keys=['True', 'False', 'None'])
        ]
    # which columns are wide (sparse, linear relationship to output)
    # and which are deep (complex relationship to output)?
    wide = [is_male, mother_race, plurality,
            mother_married, cigarette_use, alcohol_use]
    deep = [
        mother_age,
        gestation_weeks,
        tflayers.embedding_column(mother_race, 3)
    ]
    return wide, deep
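# --- Hedged usage sketch (not from the original source) ---
# get_wide_deep() is clearly shaped for a wide-and-deep estimator; a minimal
# sketch of one plausible wiring, assuming tf.contrib.learn is available and
# that a train_input_fn exists elsewhere (both names below are assumptions):
from tensorflow.contrib import learn

def make_wide_deep_model(output_dir):
    wide, deep = get_wide_deep()
    return learn.DNNLinearCombinedRegressor(
        model_dir=output_dir,           # checkpoint/summary directory
        linear_feature_columns=wide,    # linear part sees the sparse columns
        dnn_feature_columns=deep,       # DNN part sees real-valued + embeddings
        dnn_hidden_units=[64, 32])

# model = make_wide_deep_model('babyweight_trained')        # hypothetical dir
# model.fit(input_fn=train_input_fn, steps=1000)            # hypothetical input_fn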
def train_and_eval(train_steps, log_dir, training_set, validation_set, testing_set):
    sparse_columns = [
        layers.sparse_column_with_keys(attribute, training_set[attribute].unique())
        for attribute in FEATURE_ATTRIBUTES
    ]
    embedding_columns = [
        layers.embedding_column(column, dimension=8) for column in sparse_columns
    ]
    m = learn.DNNClassifier(
        hidden_units=[10, 50],
        feature_columns=embedding_columns,
        model_dir=log_dir,
        config=learn.RunConfig(save_checkpoints_secs=1),
    )
    validation_metrics = {
        "accuracy": learn.MetricSpec(metric_fn=metrics.streaming_accuracy,
                                     prediction_key="classes"),
        "precision": learn.MetricSpec(metric_fn=metrics.streaming_precision,
                                      prediction_key="classes"),
        "recall": learn.MetricSpec(metric_fn=metrics.streaming_recall,
                                   prediction_key="classes"),
    }
    monitors = [
        learn.monitors.ValidationMonitor(
            input_fn=lambda: input_fn(validation_set),
            every_n_steps=1000,
            metrics=validation_metrics,
            early_stopping_rounds=1,
        ),
    ]
    m.fit(input_fn=lambda: input_fn(training_set), steps=train_steps,
          monitors=monitors)
    results = m.evaluate(input_fn=lambda: input_fn(testing_set), steps=1)
    for key in sorted(results):
        print("%s: %s" % (key, results[key]))
def get_features_raw():
    real = {
        colname: tflayers.real_valued_column(colname)
        for colname in
        ('dep_delay,taxiout,distance,avg_dep_delay,avg_arr_delay'
         ',dep_lat,dep_lon,arr_lat,arr_lon').split(',')
    }
    sparse = {
        'carrier': tflayers.sparse_column_with_keys(
            'carrier',
            keys='AS,VX,F9,UA,US,WN,HA,EV,MQ,DL,OO,B6,NK,AA'.split(',')),
        'origin': tflayers.sparse_column_with_hash_bucket(
            'origin', hash_bucket_size=1000),  # FIXME
        'dest': tflayers.sparse_column_with_hash_bucket(
            'dest', hash_bucket_size=1000)  # FIXME
    }
    return real, sparse
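# --- Hedged sketch (not from the original source) ---
# The FIXMEs above suggest replacing the hash buckets with explicit
# vocabularies once the airport codes are known; a minimal sketch, assuming
# lists of codes exist (origin_airports / dest_airports are hypothetical names):
# origin_airports = ['ATL', 'ORD', 'DFW']   # ... full vocabulary goes here
# dest_airports = origin_airports           # or a separate list
# sparse['origin'] = tflayers.sparse_column_with_keys('origin', keys=origin_airports)
# sparse['dest'] = tflayers.sparse_column_with_keys('dest', keys=dest_airports)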
def build_estimator(model_dir, model_type):
    """Build an estimator."""
    # base sparse feature processing
    gender = layers.sparse_column_with_keys(column_name='gender', keys=['female', 'male'])
    education = layers.sparse_column_with_hash_bucket(column_name='education', hash_bucket_size=1000)
    relationship = layers.sparse_column_with_hash_bucket(column_name='relationship', hash_bucket_size=100)
    workclass = layers.sparse_column_with_hash_bucket(column_name='workclass', hash_bucket_size=100)
    occupation = layers.sparse_column_with_hash_bucket(column_name='occupation', hash_bucket_size=1000)
    native_country = layers.sparse_column_with_hash_bucket(column_name='native_country', hash_bucket_size=1000)
    # base continuous features
    age = layers.real_valued_column(column_name='age')
    education_num = layers.real_valued_column(column_name='education_num')
    capital_gain = layers.real_valued_column(column_name='capital_gain')
    capital_loss = layers.real_valued_column(column_name='capital_loss')
    hours_per_week = layers.real_valued_column(column_name='hours_per_week')
    # transformation: bucketization turns the continuous variable into
    # categorical labels, which can improve accuracy
    age_bucket = layers.bucketized_column(source_column=age,
                                          boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
    # wide columns and deep columns:
    # features used by the wide model vs. features used by the deep model;
    # the wide model uses only categorical features
    wide_columns = [gender, native_country, education, relationship, workclass,
                    occupation, age_bucket,
                    layers.crossed_column(columns=[education, occupation],
                                          hash_bucket_size=int(1e4)),
                    layers.crossed_column(columns=[age_bucket, education, occupation],
                                          hash_bucket_size=int(1e6)),
                    layers.crossed_column(columns=[native_country, occupation],
                                          hash_bucket_size=int(1e4))]
    deep_columns = [layers.embedding_column(workclass, dimension=8),
                    layers.embedding_column(education, dimension=8),
                    layers.embedding_column(gender, dimension=8),
                    layers.embedding_column(relationship, dimension=8),
                    layers.embedding_column(native_country, dimension=8),
                    layers.embedding_column(occupation, dimension=8),
                    age, education_num, capital_gain, capital_loss, hours_per_week]
    if model_type == "wide":
        m = learn.LinearClassifier(feature_columns=wide_columns, model_dir=model_dir)
    elif model_type == "deep":
        m = learn.DNNClassifier(feature_columns=deep_columns, model_dir=model_dir,
                                hidden_units=[100, 50])
    else:
        m = learn.DNNLinearCombinedClassifier(model_dir=model_dir,
                                              linear_feature_columns=wide_columns,
                                              dnn_feature_columns=deep_columns,
                                              dnn_hidden_units=[256, 128, 64],
                                              dnn_activation_fn=tf.nn.relu)
    return m
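# --- Hedged usage sketch (not from the original source) ---
# A minimal example of driving build_estimator(), assuming an input_fn for the
# census data exists elsewhere (train_input_fn / eval_input_fn are hypothetical):
# import tempfile
# m = build_estimator(tempfile.mkdtemp(), model_type='wide_n_deep')
# m.fit(input_fn=train_input_fn, steps=200)
# results = m.evaluate(input_fn=eval_input_fn, steps=1)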
def part4():
    global boston, x_data, y_data
    import pandas as pd
    import numpy as np
    N = 10000
    weight = np.random.randn(N) * 5 + 70
    spec_id = np.random.randint(0, 3, N)
    bias = [0.9, 1, 1.1]
    height = np.array([weight[i] / 100 + bias[b]
                       for i, b in enumerate(spec_id)])
    spec_name = ['Goblin', 'Human', 'ManBear']
    spec = [spec_name[s] for s in spec_id]
    df = pd.DataFrame({'Species': spec, 'Weight': weight, 'Height': height})

    from tensorflow.contrib import layers
    Weight = layers.real_valued_column("Weight")
    Species = layers.sparse_column_with_keys(column_name="Species", keys=spec_name)
    reg = learn.LinearRegressor(feature_columns=[Weight, Species])

    def input_fn(df):
        feature_cols = {}
        feature_cols['Weight'] = tf.constant(df['Weight'].values)
        feature_cols['Species'] = tf.SparseTensor(
            indices=[[i, 0] for i in range(df['Species'].size)],
            values=df['Species'].values,
            dense_shape=[df['Species'].size, 1])
        labels = tf.constant(df['Height'].values)
        return feature_cols, labels

    reg.fit(input_fn=lambda: input_fn(df), steps=50000)
    w_w = reg.get_variable_value('linear/Weight/weight')
    print(f"Estimation for Weight: {w_w}")
    v = reg.get_variable_names()
    print(f"Classes: {v}")
    s_w = reg.get_variable_value('linear/Species/weights')
    b = reg.get_variable_value('linear/bias_weight')
    print(f"Estimation for Species: {s_w + b}")
def train_and_eval(model_dir, training_set, testing_set):
    sparse_columns = [
        layers.sparse_column_with_keys(
            attribute['name'],
            pandas.read_csv(attribute['path'], sep='\t')['id'].apply(str),
        )
        for attribute in FEATURE_ATTRIBUTES
    ]
    embedding_columns = [layers.embedding_column(column, dimension=3)
                         for column in sparse_columns]
    model = learn.DNNRegressor(
        hidden_units=[3],
        feature_columns=embedding_columns,
        model_dir=model_dir,
        config=learn.RunConfig(save_checkpoints_secs=100),
    )
    model.fit(input_fn=lambda: input_fn(training_set), steps=20000)
    results = model.evaluate(input_fn=lambda: input_fn(testing_set), steps=1)
    for key in sorted(results):
        print('%s: %s' % (key, results[key]))
def build_feature_cols():
    # Sparse base columns.
    gender = layers.sparse_column_with_keys(column_name="gender",
                                            keys=["female", "male"])
    race = layers.sparse_column_with_keys(column_name="race", keys=[
        "Amer-Indian-Eskimo", "Asian-Pac-Islander", "Black", "Other", "White"
    ])
    education = layers.sparse_column_with_hash_bucket("education", hash_bucket_size=1000)
    marital_status = layers.sparse_column_with_hash_bucket(
        "marital_status", hash_bucket_size=100)
    relationship = layers.sparse_column_with_hash_bucket("relationship", hash_bucket_size=100)
    workclass = layers.sparse_column_with_hash_bucket("workclass", hash_bucket_size=100)
    occupation = layers.sparse_column_with_hash_bucket("occupation", hash_bucket_size=1000)
    native_country = layers.sparse_column_with_hash_bucket(
        "native_country", hash_bucket_size=1000)

    # Continuous base columns.
    age = layers.real_valued_column("age")
    education_num = layers.real_valued_column("education_num")
    capital_gain = layers.real_valued_column("capital_gain")
    capital_loss = layers.real_valued_column("capital_loss")
    hours_per_week = layers.real_valued_column("hours_per_week")

    # Transformations.
    age_buckets = layers.bucketized_column(
        age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
    education_occupation = layers.crossed_column([education, occupation],
                                                 hash_bucket_size=int(1e4))
    age_race_occupation = layers.crossed_column([age_buckets, race, occupation],
                                                hash_bucket_size=int(1e6))
    country_occupation = layers.crossed_column([native_country, occupation],
                                               hash_bucket_size=int(1e4))

    # Wide columns and deep columns.
    wide_columns = [
        gender, native_country, education, occupation, workclass, race,
        marital_status, relationship, age_buckets, education_occupation,
        age_race_occupation, country_occupation
    ]
    deep_columns = [
        layers.embedding_column(gender, dimension=8),
        layers.embedding_column(native_country, dimension=8),
        layers.embedding_column(education, dimension=8),
        layers.embedding_column(occupation, dimension=8),
        layers.embedding_column(workclass, dimension=8),
        layers.embedding_column(race, dimension=8),
        layers.embedding_column(marital_status, dimension=8),
        layers.embedding_column(relationship, dimension=8),
        # layers.embedding_column(age_buckets, dimension=8),
        layers.embedding_column(education_occupation, dimension=8),
        layers.embedding_column(age_race_occupation, dimension=8),
        layers.embedding_column(country_occupation, dimension=8),
        age, education_num, capital_gain, capital_loss, hours_per_week,
    ]
    return wide_columns, deep_columns
    # (tail of an input_fn defined above this fragment)
    values = table_species_weight_height['Species'].values
    dense_shape = [table_species_weight_height['Species'].size, 1]
    feature_cols['Species'] = tf.SparseTensor(indices, values, dense_shape)
    measured_heights = tf.constant(table_species_weight_height['Height'].values)
    return feature_cols, measured_heights


# In[ ]:

Weight = layers.real_valued_column("Weight")
Species = layers.sparse_column_with_keys(
    column_name="Species", keys=['Goblin', 'Human', 'ManBears'])


# In[ ]:

reg = learn.LinearRegressor(feature_columns=[Weight, Species])


# In[ ]:

reg.fit(input_fn=lambda: input_fn(table_species_weight_height),
        steps=25000)  # steps=50000


# In[ ]:
def input_fn(df):
    feature_cols = {}
    feature_cols['Weight'] = tf.constant(df['Weight'].values)
    feature_cols['Species'] = tf.SparseTensor(
        indices=[[i, 0] for i in range(df['Species'].size)],
        values=df['Species'].values,
        dense_shape=[df['Species'].size, 1]
    )
    labels = tf.constant(df['Height'].values)
    return feature_cols, labels


from tensorflow.contrib import layers
from tensorflow.contrib import learn

Weight = layers.real_valued_column('Weight')
Species = layers.sparse_column_with_keys(column_name='Species',
                                         keys=['Goblin', 'Human', 'MinBears'])
reg = learn.LinearRegressor(feature_columns=[Weight, Species])
reg.fit(input_fn=lambda: input_fn(df), steps=50000)

w_w = reg.get_variable_value('linear/Weight/weight')
print('Estimation for Weight: {}'.format(w_w))
s_w = reg.get_variable_value('linear/Species/weights')
b = reg.get_variable_value('linear/bias_weight')
print('Estimation for Species: {}'.format(s_w + b))
## last line..
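# --- Hedged illustration (not from the original source) ---
# The SparseTensor built in input_fn encodes one categorical string per
# example, one entry per row in column 0. For a 3-row frame with Species
# values ['Human', 'Goblin', 'Human'] it is equivalent to:
# tf.SparseTensor(indices=[[0, 0], [1, 0], [2, 0]],
#                 values=['Human', 'Goblin', 'Human'],
#                 dense_shape=[3, 1])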
from os.path import dirname, realpath  # needed for the path constants below
from tensorflow.contrib.learn.python.learn import DNNLinearCombinedClassifier, LinearClassifier
from tensorflow.contrib.layers import (bucketized_column, crossed_column,
                                       embedding_column, sparse_column_with_keys,
                                       sparse_column_with_hash_bucket,
                                       real_valued_column)
from tempfile import mkdtemp

PATH_TO_DIRECTORY_OF_THIS_FILE = dirname(realpath(__file__))
PATH_TO_DIRECTORY_OF_INPUT_DATA = PATH_TO_DIRECTORY_OF_THIS_FILE + "/data/input"
MODEL_DIR = PATH_TO_DIRECTORY_OF_THIS_FILE + "/classifier"

CATEGORICAL_COLUMNS = ["admin_level", "country_code", "edit_distance", "has_mpoly",
                       "has_pcode", "is_country", "is_highest_population",
                       "is_lowest_admin_level", "matches_topic"]
CONTINUOUS_COLUMNS = ["cluster_frequency", "country_rank", "median_distance",
                      "population", "popularity"]
LABEL_COLUMN = "correct"
COLUMNS = sorted(CATEGORICAL_COLUMNS + CONTINUOUS_COLUMNS) + [LABEL_COLUMN]
print "COLUMNS:", COLUMNS

admin_level = sparse_column_with_keys(column_name="admin_level",
                                      keys=["None", "0", "1", "2", "3", "4", "5", "6"])  # I've never seen admin 6, but you never know!
cluster_frequency = real_valued_column("cluster_frequency")
cluster_frequency_buckets = bucketized_column(cluster_frequency,
                                              boundaries=[0, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1])
country_code = sparse_column_with_hash_bucket("country_code", hash_bucket_size=500)
country_rank = real_valued_column("country_rank")
edit_distance = sparse_column_with_keys(column_name="edit_distance", keys=["0", "1", "2"])
has_pcode = sparse_column_with_keys(column_name="has_pcode", keys=["True", "False"])
has_mpoly = sparse_column_with_keys(column_name="has_mpoly", keys=["True", "False"])
is_country = sparse_column_with_keys(column_name="is_country", keys=["True", "False"])
is_lowest_admin_level = sparse_column_with_keys(column_name="is_lowest_admin_level", keys=["True", "False"])
is_highest_population = sparse_column_with_keys(column_name="is_highest_population", keys=["True", "False"])
matches_topic = sparse_column_with_keys(column_name="matches_topic", keys=["True", "False"])
median_distance = real_valued_column("median_distance")
median_distance_buckets = bucketized_column(median_distance, boundaries=[10, 50, 100, 200, 300])
population = real_valued_column("population")
population_buckets = bucketized_column(population,
                                       boundaries=[0, 1, 100, 1000, 10000, 100000,
                                                   1000000, 10000000, 100000000])
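# --- Hedged sketch (not from the original source) ---
# The imports above suggest these columns feed a wide-and-deep classifier;
# one plausible wiring (the wide/deep grouping here is an assumption, not the
# author's actual split):
# wide = [admin_level, country_code, edit_distance, has_pcode, has_mpoly,
#         is_country, is_lowest_admin_level, is_highest_population,
#         matches_topic, cluster_frequency_buckets, median_distance_buckets,
#         population_buckets]
# deep = [embedding_column(country_code, dimension=8),
#         cluster_frequency, country_rank, median_distance, population]
# classifier = DNNLinearCombinedClassifier(model_dir=MODEL_DIR,
#                                          linear_feature_columns=wide,
#                                          dnn_feature_columns=deep,
#                                          dnn_hidden_units=[100, 50])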
    # (tail of a _serving_input_fn defined above this fragment)
    feature_placeholders = dict(list(cont_feature_placeholders.items()) +
                                list(cat_feature_placeholders.items()))
    features = {column: tensor for column, tensor in feature_placeholders.items()}
    label = None
    return InputFnOps(features, label, feature_placeholders)

#
# Binary classification
#

audit_df = load_csv("Audit.csv")
audit_df["Adjusted"] = audit_df["Adjusted"].astype(int)

audit_cont_columns = ["Age", "Income", "Deductions", "Hours"]
audit_cat_columns = ["Employment", "Education", "Marital", "Occupation", "Gender"]
audit_feature_columns = [real_valued_column(column, dtype=tf.float64)
                         for column in audit_cont_columns] + \
                        [sparse_column_with_keys(column, dtype=tf.string,
                                                 keys=sorted(audit_df[column].unique()))
                         for column in audit_cat_columns]

def audit_input_fn():
    return _input_fn(audit_df, audit_cont_columns, audit_cat_columns, "Adjusted")

def audit_serving_input_fn():
    return _serving_input_fn(audit_cont_columns, audit_cat_columns)

def build_audit(classifier, max_steps, name, with_proba=True):
    classifier.fit(input_fn=audit_input_fn, max_steps=max_steps)
    adjusted = DataFrame(classifier.predict(input_fn=audit_input_fn, as_iterable=False),
                         columns=["_target"])
    if with_proba:
        adjusted_proba = DataFrame(classifier.predict_proba(input_fn=audit_input_fn,
                                                            as_iterable=False),
                                   columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name + ".csv")
UNUSED_COLUMNS = ['datetime']

INPUT_COLUMN_NAMES = {
    'dayofweek': tf.string,
    'hourofday': tf.int32,
    'pickuplon': tf.float32,
    'pickuplat': tf.float32,
    'dropofflon': tf.float32,
    'dropofflat': tf.float32,
    'passengers': tf.int32
}

# These are the raw input columns, and will be provided for prediction also
INPUT_COLUMNS = [
    # define features
    layers.sparse_column_with_keys(
        'dayofweek', keys=['Sun', 'Mon', 'Tues', 'Wed', 'Thu', 'Fri', 'Sat']),
    layers.sparse_column_with_integerized_feature('hourofday', bucket_size=24),

    # engineered features that are created in the input_fn
    layers.real_valued_column('latdiff'),
    layers.real_valued_column('londiff'),
    layers.real_valued_column('euclidean'),

    # real_valued_column
    layers.real_valued_column('pickuplon'),
    layers.real_valued_column('pickuplat'),
    layers.real_valued_column('dropofflat'),
    layers.real_valued_column('dropofflon'),
    layers.real_valued_column('passengers'),
]
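# --- Hedged sketch (not from the original source) ---
# The 'latdiff', 'londiff' and 'euclidean' columns above are created inside
# the input_fn; a minimal sketch of such a helper, assuming tf is in scope
# (the name add_engineered and the exact formulas are assumptions):
def add_engineered(features):
    # coordinate deltas between pickup and dropoff
    latdiff = features['pickuplat'] - features['dropofflat']
    londiff = features['pickuplon'] - features['dropofflon']
    features['latdiff'] = latdiff
    features['londiff'] = londiff
    # straight-line distance in degree space
    features['euclidean'] = tf.sqrt(latdiff * latdiff + londiff * londiff)
    return features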
import tensorflow as tf                   # needed for tf.logging below
import tensorflow.contrib.learn as tflearn
from tensorflow.contrib import layers    # needed for the feature columns below
from tensorflow.contrib import metrics
import numpy as np

tf.logging.set_verbosity(tf.logging.INFO)

CSV_COLUMNS = 'fare_amount,dayofweek,hourofday,pickuplon,pickuplat,dropofflon,dropofflat,passengers,key'.split(',')
SCALE_COLUMNS = ['pickuplon', 'pickuplat', 'dropofflon', 'dropofflat', 'passengers']
LABEL_COLUMN = 'fare_amount'
KEY_FEATURE_COLUMN = 'key'
DEFAULTS = [[0.0], ['Sun'], [0], [-74.0], [40.0], [-74.0], [40.7], [1.0], ['nokey']]

# These are the raw input columns, and will be provided for prediction also
INPUT_COLUMNS = [
    # define features
    layers.sparse_column_with_keys('dayofweek',
                                   keys=['Sun', 'Mon', 'Tues', 'Wed', 'Thu', 'Fri', 'Sat']),
    layers.sparse_column_with_integerized_feature('hourofday', bucket_size=24),

    # engineered features that are created in the input_fn
    layers.real_valued_column('latdiff'),
    layers.real_valued_column('londiff'),
    layers.real_valued_column('euclidean'),

    # real_valued_column
    layers.real_valued_column('pickuplon'),
    layers.real_valued_column('pickuplat'),
    layers.real_valued_column('dropofflat'),
    layers.real_valued_column('dropofflon'),
    layers.real_valued_column('passengers'),
]
           'marital_status', 'occupation', 'relationship', 'race', 'gender',
           'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
           'income_bracket']

CSV_COLUMN_DEFAULTS = [[0], [''], [0], [''], [0], [''], [''], [''], [''], [''],
                       [0], [0], [0], [''], ['']]
LABEL_COLUMN = 'income_bracket'
LABELS = [' <=50K', ' >50K']

# Define the initial ingestion of each feature used by your model.
# Additionally, provide metadata about the feature.
INPUT_COLUMNS = [
    # Categorical base columns

    # For categorical columns with known values we can provide lists
    # of values ahead of time.
    layers.sparse_column_with_keys(column_name='gender', keys=['female', 'male']),

    layers.sparse_column_with_keys(
        column_name='race',
        keys=['Amer-Indian-Eskimo', 'Asian-Pac-Islander', 'Black', 'Other', 'White']
    ),

    # Otherwise we can use a hashing function to bucket the categories
    layers.sparse_column_with_hash_bucket('education', hash_bucket_size=1000),
    layers.sparse_column_with_hash_bucket('marital_status', hash_bucket_size=100),
training_size = int(len(data) * 0.8)
verification_size = len(data) - training_size
randomized_data = data.sample(frac=1)
training_examples = randomized_data.head(training_size)[FEATURES]
training_targets = randomized_data.head(training_size)[[TARGET]]
validation_examples = randomized_data.tail(verification_size)[FEATURES]
validation_targets = randomized_data.tail(verification_size)[[TARGET]]

STEPS = 5000
BATCH_SIZE = 5
periods = 1

feature_columns = [
    layers.sparse_column_with_keys(column_name="sex", keys=["M", "F", "I"])
] + [layers.real_valued_column(name) for name in REAL_VALUED_FEATURES]

linear_regressor = learn.LinearRegressor(
    optimizer=tensorflow.train.GradientDescentOptimizer(0.05),
    feature_columns=feature_columns)

def input_fn(features, target=None):
    """Input builder function."""
    # Create a dictionary mapping each continuous feature column name (k) to
    # the values of that column stored in a constant Tensor.
    continuous_cols = {
        k: tensorflow.constant(features[k].values)
        for k in REAL_VALUED_FEATURES
    }
#
# Binary classification
#

audit_df = load_csv("Audit.csv")
audit_df["Adjusted"] = audit_df["Adjusted"].astype(int)

audit_cont_columns = ["Age", "Income", "Deductions", "Hours"]
audit_cat_columns = [
    "Employment", "Education", "Marital", "Occupation", "Gender"
]
audit_feature_columns = [
    real_valued_column(column, dtype=tf.float64)
    for column in audit_cont_columns
] + [
    sparse_column_with_keys(
        column, dtype=tf.string, keys=sorted(audit_df[column].unique()))
    for column in audit_cat_columns
]

def audit_input_fn():
    return _input_fn(audit_df, audit_cont_columns, audit_cat_columns, "Adjusted")

def audit_serving_input_fn():
    return _serving_input_fn(audit_cont_columns, audit_cat_columns)

def build_audit(classifier, name, with_proba=True):
    classifier.fit(input_fn=audit_input_fn, steps=2000)
           # (tail of an input_fn defined above this fragment)
           indices=[[i, 0] for i in range(df[k].size)],
           values=df[k].values,
           shape=[df[k].size, 1]) for k in features}
    label = tensorflow.constant(df["tmp"].values)
    return col, label

def train_input_fn():
    return input_fn(data)

def testing_fn():
    return input_fn(data_test)

data["tmp"] = data["class"].apply(help)
data_test["tmp"] = data_test["class"].apply(help)

buying = layers.sparse_column_with_keys(column_name="buying", keys=["low", "med", "high", "vhigh"])
maint = layers.sparse_column_with_keys(column_name="maint", keys=["low", "med", "high", "vhigh"])
doors = layers.sparse_column_with_keys(column_name="doors", keys=["2", "3", "4", "5more"])
persons = layers.sparse_column_with_keys(column_name="persons", keys=["2", "4", "more"])
lug_boot = layers.sparse_column_with_keys(column_name="lug_boot", keys=["small", "med", "big"])
safety = layers.sparse_column_with_keys(column_name="safety", keys=["low", "med", "high"])

buying_emb = layers.embedding_column(buying, dimension=4)
maint_emb = layers.embedding_column(maint, dimension=4)
doors_emb = layers.embedding_column(doors, dimension=4)
persons_emb = layers.embedding_column(persons, dimension=3)
lug_boot_emb = layers.embedding_column(lug_boot, dimension=3)
safety_emb = layers.embedding_column(safety, dimension=3)

dnn_classifier = learn.DNNClassifier(
    feature_columns=[buying_emb, maint_emb, doors_emb, persons_emb,
                     lug_boot_emb, safety_emb],
    hidden_units=[10],
    n_classes=4,
)
# dnn_classifier.fit(X_train, y_train, steps=1000)
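# --- Hedged usage sketch (not from the original source) ---
# With sparse feature columns the estimator is normally fed through an
# input_fn rather than the (X_train, y_train) arrays in the commented-out
# line above; a minimal sketch using the functions defined in this snippet:
# dnn_classifier.fit(input_fn=train_input_fn, steps=1000)
# results = dnn_classifier.evaluate(input_fn=testing_fn, steps=1)
# for key in sorted(results):
#     print("%s: %s" % (key, results[key]))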