def wide_and_deep(output_dir, nbuckets=5, hidden_units='64,16,4', learning_rate=0.01):
    """Build a wide-and-deep classifier over the flight features.

    Args:
        output_dir: directory where the estimator writes its checkpoints.
        nbuckets: number of buckets used to discretize latitude/longitude.
        hidden_units: comma-separated DNN layer sizes, e.g. '64,16,4'.
        learning_rate: currently unused; kept for interface compatibility.

    Returns:
        A tf.contrib.learn DNNLinearCombinedClassifier whose linear (wide)
        side gets all sparse/crossed columns and whose DNN (deep) side gets
        the real-valued columns plus embeddings of the sparse columns.
    """
    real, sparse = get_features()
    # Parse '64,16,4' -> [64, 16, 4]; debug print removed.
    hidden_units = [int(n) for n in hidden_units.split(',')]
    # Bucketize/discretize lat and lon into nbuckets (continental-USA ranges).
    latbuckets = np.linspace(20.0, 50.0, nbuckets).tolist()
    lonbuckets = np.linspace(-120.0, -70.0, nbuckets).tolist()
    disc = {}
    disc.update({'d_{}'.format(key): tflayers.bucketized_column(real[key], latbuckets)
                 for key in ['dep_lat', 'arr_lat']})
    disc.update({'d_{}'.format(key): tflayers.bucketized_column(real[key], lonbuckets)
                 for key in ['dep_lon', 'arr_lon']})
    # Cross columns for new features: departure/arrival grid cells, the
    # departure-arrival cell pair, and the origin-destination airport pair.
    sparse['dep_loc'] = tflayers.crossed_column(
        [disc['d_dep_lat'], disc['d_dep_lon']], nbuckets * nbuckets)
    sparse['arr_loc'] = tflayers.crossed_column(
        [disc['d_arr_lat'], disc['d_arr_lon']], nbuckets * nbuckets)
    sparse['dep_arr'] = tflayers.crossed_column(
        [sparse['dep_loc'], sparse['arr_loc']], nbuckets**4)
    sparse['ori_dest'] = tflayers.crossed_column(
        [sparse['origin'], sparse['dest']], hash_bucket_size=1000)
    # Create embeddings of all the sparse columns for the deep side.
    embed = {colname: create_embed(col) for colname, col in sparse.items()}
    real.update(embed)
    estimator = tflearn.DNNLinearCombinedClassifier(
        model_dir=output_dir,
        # list(...) so Python-3 dict views are not handed to the estimator,
        # which expects list-like feature-column arguments.
        linear_feature_columns=list(sparse.values()),
        dnn_feature_columns=list(real.values()),
        dnn_hidden_units=hidden_units)
    # HACK: reaches into the head's private attribute to set the decision
    # threshold; there is no public API for this in tf.contrib.learn.
    estimator.params["head"]._thresholds = [0.7]
    return estimator
def build_estimator(model_dir, model_type):
    """Build a census-income estimator.

    model_type "wide"  -> LinearClassifier over categorical/crossed columns,
    model_type "deep"  -> DNNClassifier over embeddings + continuous columns,
    anything else      -> combined wide-and-deep classifier.
    """
    # Base sparse (categorical) features.
    gender = layers.sparse_column_with_keys(column_name='gender',
                                            keys=['female', 'male'])
    hashed = {}
    for name, size in (('education', 1000), ('relationship', 100),
                       ('workclass', 100), ('occupation', 1000),
                       ('native_country', 1000)):
        hashed[name] = layers.sparse_column_with_hash_bucket(
            column_name=name, hash_bucket_size=size)
    education = hashed['education']
    relationship = hashed['relationship']
    workclass = hashed['workclass']
    occupation = hashed['occupation']
    native_country = hashed['native_country']
    # Base continuous features.
    age, education_num, capital_gain, capital_loss, hours_per_week = (
        layers.real_valued_column(column_name=name)
        for name in ('age', 'education_num', 'capital_gain',
                     'capital_loss', 'hours_per_week'))
    # Bucketizing age turns the continuous value into a categorical label,
    # which the wide (linear) side can use directly and in crosses.
    age_bucket = layers.bucketized_column(
        source_column=age,
        boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
    # Wide side: categorical columns plus hand-picked feature crosses.
    wide_columns = [
        gender, native_country, education, relationship, workclass,
        occupation, age_bucket,
        layers.crossed_column(columns=[education, occupation],
                              hash_bucket_size=int(1e4)),
        layers.crossed_column(columns=[age_bucket, education, occupation],
                              hash_bucket_size=int(1e6)),
        layers.crossed_column(columns=[native_country, occupation],
                              hash_bucket_size=int(1e4)),
    ]
    # Deep side: 8-dim embeddings of every categorical plus the raw
    # continuous columns.
    deep_columns = [layers.embedding_column(col, dimension=8)
                    for col in (workclass, education, gender, relationship,
                                native_country, occupation)]
    deep_columns += [age, education_num, capital_gain, capital_loss,
                     hours_per_week]
    if model_type == "wide":
        return learn.LinearClassifier(feature_columns=wide_columns,
                                      model_dir=model_dir)
    if model_type == "deep":
        return learn.DNNClassifier(feature_columns=deep_columns,
                                   model_dir=model_dir,
                                   hidden_units=[100, 50])
    return learn.DNNLinearCombinedClassifier(
        model_dir=model_dir,
        linear_feature_columns=wide_columns,
        dnn_feature_columns=deep_columns,
        dnn_hidden_units=[256, 128, 64],
        dnn_activation_fn=tf.nn.relu)
def wide_and_deep_model(output_dir, nbuckets=5, hidden_units='64,32', learning_rate=0.01):
    """Return a wide-and-deep flight classifier.

    The linear (wide) side takes every sparse/crossed column; the DNN (deep)
    side takes the real-valued columns plus embeddings of the sparse ones.
    """
    real, sparse = get_features()
    # Discretizing lat/lon lets the crosses below describe "air traffic
    # corridors" over the continental USA.
    latbuckets = np.linspace(20.0, 50.0, nbuckets).tolist()
    lonbuckets = np.linspace(-120.0, -70.0, nbuckets).tolist()
    disc = {}
    for key in ('dep_lat', 'arr_lat'):
        disc['d_{}'.format(key)] = tflayers.bucketized_column(real[key], latbuckets)
    for key in ('dep_lon', 'arr_lon'):
        disc['d_{}'.format(key)] = tflayers.bucketized_column(real[key], lonbuckets)
    # Cross columns that make sense in combination: departure/arrival grid
    # cells, their pairing, and the origin-destination airport pair.
    sparse['dep_loc'] = tflayers.crossed_column(
        [disc['d_dep_lat'], disc['d_dep_lon']], nbuckets * nbuckets)
    sparse['arr_loc'] = tflayers.crossed_column(
        [disc['d_arr_lat'], disc['d_arr_lon']], nbuckets * nbuckets)
    sparse['dep_arr'] = tflayers.crossed_column(
        [sparse['dep_loc'], sparse['arr_loc']], nbuckets ** 4)
    sparse['ori_dest'] = tflayers.crossed_column(
        [sparse['origin'], sparse['dest']], hash_bucket_size=1000)
    # Embed every sparse column and feed the embeddings to the deep side.
    embed = {}
    for colname, col in sparse.items():
        embed[colname] = create_embed(col)
    real.update(embed)
    estimator = tflearn.DNNLinearCombinedClassifier(
        model_dir=output_dir,
        linear_feature_columns=sparse.values(),
        dnn_feature_columns=real.values(),
        dnn_hidden_units=parse_hidden_units(hidden_units))
    # Optimizers left at their defaults; learning_rate is currently unused:
    #   linear_optimizer=tf.train.FtrlOptimizer(learning_rate=learning_rate),
    #   dnn_optimizer=tf.train.AdagradOptimizer(learning_rate=learning_rate*0.25)
    estimator.params["head"]._thresholds = [0.7]  # FIXME: hack into the head's privates
    return estimator
# NOTE(review): this chunk starts mid-script; the two statements below look
# like the tail of an enclosing loop that assigns integer ids to class labels
# (building class2id) — the loop header is outside this view, so the original
# indentation cannot be confirmed.
print >> sys.stderr, " ", cid, C
cid += 1
# Map the string class labels in the train/test target lists to integer ids.
ttrn = [class2id[C] for C in ttrn]
ttst = [class2id[C] for C in ttst]
# Earlier model experiments, kept commented out for reference:
#classifier = lrn.DNNClassifier(feature_columns=[tf.contrib.layers.real_valued_column("", dimension=snps.shape[1])],
#                               hidden_units=[10, 20, 10],
#                               n_classes=n_classes)
#classifier = lrn.DNNLinearCombinedClassifier(dnn_feature_columns=[tf.contrib.layers.real_valued_column("", dimension=snps.shape[1])],
#                                             dnn_hidden_units=[10,20,10],
#                                             n_classes=n_classes)
# Linear-only model: a single anonymous real-valued column whose width is the
# number of SNP features. The DNN side is effectively disabled here (its
# hidden-units argument is commented out and no dnn_feature_columns given).
classifier = lrn.DNNLinearCombinedClassifier(
    linear_feature_columns=[
        tf.contrib.layers.real_valued_column("", dimension=snps.shape[1])
    ],
    #dnn_hidden_units=[10,20,10],
    n_classes=n_classes)
#classifier = lrn.LinearClassifier(feature_columns=[tf.contrib.layers.real_valued_column("", dimension=snps.shape[1])], n_classes=n_classes)
# Train briefly on the training SNP matrix, then predict on the held-out set.
classifier.fit(strn, ttrn, steps=50)
pred = classifier.predict(stst)
# Set up overall and per-class accuracy counters.
count = 0
correct = 0
classCount = dict()
classCorrect = dict()
for C in classes:
    classCount[C] = 0
    classCorrect[C] = 0
def build_estimator(model_dir, classifier):
    """Build a Titanic-survival estimator.

    classifier "deep" -> DNNClassifier, "wide" -> LinearClassifier,
    anything else     -> DNNLinearCombinedClassifier over both column sets.
    """
    cl = tf.contrib.layers
    # Categorical columns with an explicit vocabulary.
    keyed = {}
    for name, keys in (("Sex", ["female", "male"]),
                       ("Family", ["Large", "Nuclear", "Solo"]),
                       ("Child", ["0", "1"]),
                       ("AgeKnown", ["0", "1"]),
                       ("Embarked", ["C", "S", "Q"]),
                       ("Young", ["0", "1"]),
                       ("MaleBadTicket", ["0", "1"])):
        keyed[name] = cl.sparse_column_with_keys(column_name=name, keys=keys)
    sex, family, child = keyed["Sex"], keyed["Family"], keyed["Child"]
    ageknown, embarked = keyed["AgeKnown"], keyed["Embarked"]
    young, malebadticket = keyed["Young"], keyed["MaleBadTicket"]
    # Categorical columns hashed into a fixed number of buckets.
    cab = cl.sparse_column_with_hash_bucket("Cab", hash_bucket_size=10)
    namet = cl.sparse_column_with_hash_bucket("NameT", hash_bucket_size=20)
    # Continuous columns.
    age = cl.real_valued_column("Age")
    namelength = cl.real_valued_column("NameLength")
    fare = cl.real_valued_column("Fare")
    p_class = cl.real_valued_column("Pclass")
    # Bucketized transformations of the continuous columns.
    fare_buckets = cl.bucketized_column(fare, boundaries=[
        5, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 550
    ])
    age_buckets = cl.bucketized_column(age, boundaries=[
        5, 18, 25, 30, 35, 40, 45, 50, 55, 65
    ])
    pclass_buckets = cl.bucketized_column(p_class, boundaries=[1, 2, 3])
    # Wide side: raw categoricals plus pairwise crosses.
    wide_columns = [sex, cab, namet, child, ageknown, embarked, young, family]
    for cross in ([age_buckets, sex],
                  [pclass_buckets, sex],
                  [fare_buckets, pclass_buckets],
                  [embarked, pclass_buckets],
                  [embarked, sex]):
        wide_columns.append(cl.crossed_column(cross, hash_bucket_size=int(1e3)))
    # Deep side: continuous columns plus 8-dim embeddings of every categorical.
    deep_columns = [namelength, fare, p_class]
    for col in (sex, child, family, cab, namet, ageknown, embarked, young,
                malebadticket):
        deep_columns.append(cl.embedding_column(col, dimension=8))
    if classifier == "deep":
        return Learn.DNNClassifier(
            model_dir=model_dir,
            feature_columns=deep_columns,
            hidden_units=[32, 16],
            optimizer=tf.train.ProximalAdagradOptimizer(
                learning_rate=0.1, l2_regularization_strength=0.001))
    if classifier == "wide":
        return Learn.LinearClassifier(
            feature_columns=wide_columns,
            optimizer=tf.train.FtrlOptimizer(
                learning_rate=5,
                l1_regularization_strength=1000.0,
                l2_regularization_strength=1000.0),
            model_dir=model_dir)
    return Learn.DNNLinearCombinedClassifier(
        linear_feature_columns=wide_columns,
        dnn_feature_columns=deep_columns,
        dnn_hidden_units=[32, 16],
        model_dir=model_dir,
        linear_optimizer=tf.train.FtrlOptimizer(
            learning_rate=10,
            l1_regularization_strength=100.0,
            l2_regularization_strength=100.0),
        dnn_optimizer=tf.train.ProximalAdagradOptimizer(
            learning_rate=0.1, l2_regularization_strength=0.001))
# NOTE(review): this chunk begins mid-statement — the first two lines are the
# tail of a pipelines.append(...) call whose opening is outside this view.
validation_set_pca_sc_scaled,
                  "Standard-Scaled->PCA->Random Forest"))
# Each pipeline entry: (model, training data, validation data, description).
pipelines.append((RandomForestClassifier(), training_set_imputed,
                  validation_set_imputed, "Imputed->Random Forest"))
pipelines.append((RandomForestClassifier(), training_set_mn_scaled_imputed,
                  validation_set_mn_scaled_imputed,
                  "Imputed->MinMax-Scaled->Random Forest"))
pipelines.append((RandomForestClassifier(), training_set_sc_scaled_imputed,
                  validation_set_sc_scaled_imputed,
                  "Imputed->Standard-Scaled->Random Forest"))
pipelines.append((RandomForestClassifier(), training_set_pca_sc_scaled_imputed,
                  validation_set_pca_sc_scaled_imputed,
                  "Imputed->Standard-Scaled->PCA->Random Forest"))
# TF classifiers over the raw (non-imputed) features: a single anonymous
# real-valued column as wide as the training frame.
fc = [layers.real_valued_column("", dimension=len(training_set.columns))]
classifier_lc = learn.LinearClassifier(feature_columns=fc, n_classes=2)
# NOTE(review): no dnn_feature_columns/dnn_hidden_units are passed here, so
# only the linear side of this combined classifier is configured — confirm
# that is intentional.
classifier_dlc = learn.DNNLinearCombinedClassifier(linear_feature_columns=fc,
                                                   n_classes=2)
classifier_dc = learn.DNNClassifier(feature_columns=fc, n_classes=2,
                                    hidden_units=[1000, 300, 200])
# Same three classifiers over the imputed feature set.
fc_imputed = [
    layers.real_valued_column("", dimension=len(training_set_imputed.columns))
]
classifier_lc_imputed = learn.LinearClassifier(feature_columns=fc_imputed,
                                               n_classes=2)
classifier_dlc_imputed = learn.DNNLinearCombinedClassifier(
    linear_feature_columns=fc_imputed, n_classes=2)
classifier_dc_imputed = learn.DNNClassifier(feature_columns=fc_imputed,
                                            n_classes=2,
                                            hidden_units=[1000, 300, 200])