def get_wide_deep():
  # Define column types.
  races = ['White', 'Black', 'American Indian', 'Chinese', 'Japanese',
           'Hawaiian', 'Filipino', 'Unknown', 'Asian Indian', 'Korean',
           'Samoan', 'Vietnamese']
  is_male, mother_age, mother_race, plurality, gestation_weeks, \
      mother_married, cigarette_use, alcohol_use = [
          tflayers.sparse_column_with_keys('is_male', keys=['True', 'False']),
          tflayers.real_valued_column('mother_age'),
          tflayers.sparse_column_with_keys('mother_race', keys=races),
          tflayers.real_valued_column('plurality'),
          tflayers.real_valued_column('gestation_weeks'),
          tflayers.sparse_column_with_keys('mother_married',
                                           keys=['True', 'False']),
          tflayers.sparse_column_with_keys('cigarette_use',
                                           keys=['True', 'False', 'None']),
          tflayers.sparse_column_with_keys('alcohol_use',
                                           keys=['True', 'False', 'None'])
      ]
  # Which columns are wide (sparse, linear relationship to output) and which
  # are deep (complex relationship to output)?
  wide = [is_male, mother_race, plurality, mother_married, cigarette_use,
          alcohol_use]
  deep = [
      mother_age,
      gestation_weeks,
      tflayers.embedding_column(mother_race, 3)
  ]
  return wide, deep
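# Hedged usage sketch (not from the original source): the wide/deep split
# above is the shape tf.contrib.learn's combined estimator expects. The
# model directory, the regression framing (e.g. a birth-weight target), and
# the hidden-unit sizes below are illustrative assumptions.
from tensorflow.contrib import learn

wide, deep = get_wide_deep()
estimator = learn.DNNLinearCombinedRegressor(
    model_dir='babyweight_model',   # hypothetical output directory
    linear_feature_columns=wide,    # sparse columns -> linear part
    dnn_feature_columns=deep,       # dense + embedded columns -> DNN part
    dnn_hidden_units=[64, 32])      # illustrative hidden-layer sizes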
def testLinearlySeparableBinaryDataNoKernels(self): """Tests classifier w/o kernels (log. regression) for lin-separable data.""" feature1 = layers.real_valued_column('feature1') feature2 = layers.real_valued_column('feature2') logreg_classifier = kernel_estimators.KernelLinearClassifier( feature_columns=[feature1, feature2]) logreg_classifier.fit(input_fn=_linearly_separable_binary_input_fn, steps=100) metrics = logreg_classifier.evaluate( input_fn=_linearly_separable_binary_input_fn, steps=1) # Since the data is linearly separable, the classifier should have small # loss and perfect accuracy. self.assertLess(metrics['loss'], 0.1) self.assertEqual(metrics['accuracy'], 1.0) # As a result, it should assign higher probability to class 1 for the 1st # and 3rd example and higher probability to class 0 for the second example. logreg_prob_predictions = list( logreg_classifier.predict_proba( input_fn=_linearly_separable_binary_input_fn)) self.assertGreater(logreg_prob_predictions[0][1], 0.5) self.assertGreater(logreg_prob_predictions[1][0], 0.5) self.assertGreater(logreg_prob_predictions[2][1], 0.5)
def build_estimator(model_dir, model_type):
  """Build an estimator."""
  # Base sparse (categorical) features.
  gender = layers.sparse_column_with_keys(column_name='gender', keys=['female', 'male'])
  education = layers.sparse_column_with_hash_bucket(column_name='education', hash_bucket_size=1000)
  relationship = layers.sparse_column_with_hash_bucket(column_name='relationship', hash_bucket_size=100)
  workclass = layers.sparse_column_with_hash_bucket(column_name='workclass', hash_bucket_size=100)
  occupation = layers.sparse_column_with_hash_bucket(column_name='occupation', hash_bucket_size=1000)
  native_country = layers.sparse_column_with_hash_bucket(column_name='native_country', hash_bucket_size=1000)
  # Base continuous features.
  age = layers.real_valued_column(column_name='age')
  education_num = layers.real_valued_column(column_name='education_num')
  capital_gain = layers.real_valued_column(column_name='capital_gain')
  capital_loss = layers.real_valued_column(column_name='capital_loss')
  hours_per_week = layers.real_valued_column(column_name='hours_per_week')
  # Transformation: bucketization turns a continuous variable into categorical
  # labels, which can improve accuracy.
  age_bucket = layers.bucketized_column(source_column=age,
                                        boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
  # Wide columns and deep columns: the features used by the deep model vs. the
  # wide model. The wide model uses only the categorical features.
  wide_columns = [gender, native_country, education, relationship, workclass,
                  occupation, age_bucket,
                  layers.crossed_column(columns=[education, occupation], hash_bucket_size=int(1e4)),
                  layers.crossed_column(columns=[age_bucket, education, occupation], hash_bucket_size=int(1e6)),
                  layers.crossed_column(columns=[native_country, occupation], hash_bucket_size=int(1e4))]
  deep_columns = [layers.embedding_column(workclass, dimension=8),
                  layers.embedding_column(education, dimension=8),
                  layers.embedding_column(gender, dimension=8),
                  layers.embedding_column(relationship, dimension=8),
                  layers.embedding_column(native_country, dimension=8),
                  layers.embedding_column(occupation, dimension=8),
                  age, education_num, capital_gain, capital_loss, hours_per_week]
  if model_type == "wide":
    m = learn.LinearClassifier(feature_columns=wide_columns, model_dir=model_dir)
  elif model_type == "deep":
    m = learn.DNNClassifier(feature_columns=deep_columns, model_dir=model_dir,
                            hidden_units=[100, 50])
  else:
    m = learn.DNNLinearCombinedClassifier(model_dir=model_dir,
                                          linear_feature_columns=wide_columns,
                                          dnn_feature_columns=deep_columns,
                                          dnn_hidden_units=[256, 128, 64],
                                          dnn_activation_fn=tf.nn.relu)
  return m
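# Hedged usage sketch (assumption, not the original script): training and
# evaluating the estimator returned by build_estimator. train_input_fn and
# eval_input_fn are hypothetical input functions yielding the census feature
# dict plus labels.
from tempfile import mkdtemp

m = build_estimator(model_dir=mkdtemp(), model_type="wide_n_deep")
m.fit(input_fn=train_input_fn, steps=2000)
results = m.evaluate(input_fn=eval_input_fn, steps=1)
print("accuracy: %s" % results["accuracy"])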
def testInvalidNumberOfClasses(self):
  """ValueError raised when n_classes is invalid (must be at least 2)."""
  feature = layers.real_valued_column('feature')
  with self.assertRaises(ValueError):
    _ = kernel_estimators.KernelLinearClassifier(
        feature_columns=[feature], n_classes=1)
def testMulticlassDataWithAndWithoutKernels(self): """Tests classifier w/ and w/o kernels on multiclass data.""" feature_column = layers.real_valued_column('feature', dimension=4) # Metrics for linear classifier (no kernels). linear_classifier = kernel_estimators.KernelLinearClassifier( feature_columns=[feature_column], n_classes=3) linear_classifier.fit(input_fn=test_data.iris_input_multiclass_fn, steps=50) linear_metrics = linear_classifier.evaluate( input_fn=test_data.iris_input_multiclass_fn, steps=1) linear_loss = linear_metrics['loss'] linear_accuracy = linear_metrics['accuracy'] # Using kernel mappers allows to discover non-linearities in data (via RBF # kernel approximation), reduces loss and increases accuracy. kernel_mappers = { feature_column: [ RandomFourierFeatureMapper(input_dim=4, output_dim=50, stddev=1.0, name='rffm') ] } kernel_linear_classifier = kernel_estimators.KernelLinearClassifier( feature_columns=[], n_classes=3, kernel_mappers=kernel_mappers) kernel_linear_classifier.fit( input_fn=test_data.iris_input_multiclass_fn, steps=50) kernel_linear_metrics = kernel_linear_classifier.evaluate( input_fn=test_data.iris_input_multiclass_fn, steps=1) kernel_linear_loss = kernel_linear_metrics['loss'] kernel_linear_accuracy = kernel_linear_metrics['accuracy'] self.assertLess(kernel_linear_loss, linear_loss) self.assertGreater(kernel_linear_accuracy, linear_accuracy)
def main(): # If the training and test sets aren't stored locally, download them. if not os.path.exists(IRIS_TRAINING): raw = urlopen(IRIS_TRAINING_URL).read() with open(IRIS_TRAINING, "wb") as f: f.write(raw) if not os.path.exists(IRIS_TEST): raw = urlopen(IRIS_TEST_URL).read() with open(IRIS_TEST, "wb") as f: f.write(raw) # Load datasets. training_set = load_csv_with_header(filename=IRIS_TRAINING, target_dtype=np.int, features_dtype=np.float32) test_set = load_csv_with_header(filename=IRIS_TEST, target_dtype=np.int, features_dtype=np.float32) # Specify that all features have real-value data feature_columns = [real_valued_column("", dimension=4)] # Build 3 layer DNN with 10, 20, 10 units respectively. classifier = DNNClassifier(feature_columns=feature_columns, hidden_units=[10, 20, 10], n_classes=3, model_dir="/tmp/iris_model") # Define the training inputs def get_train_inputs(): x = tf.constant(training_set.data) y = tf.constant(training_set.target) return x, y # Fit model. classifier.fit(input_fn=get_train_inputs, steps=2000) # Define the test inputs def get_test_inputs(): x = tf.constant(test_set.data) y = tf.constant(test_set.target) return x, y # Evaluate accuracy. accuracy_score = classifier.evaluate(input_fn=get_test_inputs, steps=1)["accuracy"] print("\nTest Accuracy: {0:f}\n".format(accuracy_score)) # Classify two new flower samples. def new_samples(): return np.array([[6.4, 3.2, 4.5, 1.5], [5.8, 3.1, 5.0, 1.7]], dtype=np.float32) predictions = list(classifier.predict(input_fn=new_samples)) print("New Samples, Class Predictions: {}\n".format(predictions))
def get_conv_classifier(): n_classes = 5 feature_columns = [layers.real_valued_column("", dimension=3)] # learning_rate = 1.0 # optimizer = AdagradOptimizer(learning_rate) # # learning_rate = 1.0 # optimizer = AdadeltaOptimizer(learning_rate=learning_rate) # ~ 62.55% learning_rate = 0.01 optimizer = AdamOptimizer(learning_rate, epsilon=0.1) # learning_rate = 0.05 # optimizer = GradientDescentOptimizer(learning_rate) # learning_rate = 0.1 # optimizer = RMSPropOptimizer(learning_rate, momentum=0.1) # learning_rate = 0.1 # optimizer = FtrlOptimizer(learning_rate) return SKCompat(Estimator( model_fn=get_conv_model, params={ 'head': head_lib._multi_class_head( # pylint: disable=protected-access n_classes, enable_centered_bias=False), 'feature_columns': feature_columns, 'activation_fn': tf.nn.relu, 'learning_rate': learning_rate, 'optimizer': optimizer }, model_dir='saved_model'))
def get_feature_columns(self): """Get a list of feature column names.""" feature_columns = [ 'idx_{}.coef_{:.3f}'.format(i, self._coefficients[i]) for i in range(self._num_feature) ] return [contrib_layers.real_valued_column(fc) for fc in feature_columns]
def _add_bias_column(feature_columns, columns_to_tensors, bias_variable,
                     columns_to_variables):
  """Adds a fake bias feature column filled with all 1s."""
  # TODO(b/31008490): Move definition to a common constants place.
  bias_column_name = "tf_virtual_bias_column"
  # Compare strings with ==, not identity.
  if any(col.name == bias_column_name for col in feature_columns):
    raise ValueError("%s is a reserved column name." % bias_column_name)
  if not feature_columns:
    raise ValueError("feature_columns can't be empty.")

  # Loop through input tensors until we can figure out batch_size.
  batch_size = None
  for column in columns_to_tensors.values():
    if isinstance(column, tuple):
      column = column[0]
    if isinstance(column, sparse_tensor.SparseTensor):
      shape = tensor_util.constant_value(column.dense_shape)
      if shape is not None:
        batch_size = shape[0]
        break
    else:
      batch_size = array_ops.shape(column)[0]
      break
  if batch_size is None:
    raise ValueError("Could not infer batch size from input features.")

  bias_column = layers.real_valued_column(bias_column_name)
  columns_to_tensors[bias_column] = array_ops.ones([batch_size, 1],
                                                   dtype=dtypes.float32)
  columns_to_variables[bias_column] = [bias_variable]
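# Toy illustration (an assumption for exposition, not library code) of the two
# batch-size branches above: a SparseTensor carries its batch size in
# dense_shape[0], while a dense Tensor exposes it via tf.shape(...)[0].
import tensorflow as tf

sp = tf.SparseTensor(indices=[[0, 0], [2, 1]], values=[1.0, 2.0],
                     dense_shape=[3, 5])
dense = tf.ones([3, 4])
batch_from_sparse = tf.contrib.util.constant_value(sp.dense_shape)[0]  # 3
batch_from_dense = tf.shape(dense)[0]  # Tensor that evaluates to 3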
def CustomerTrainTask(self, dataset_id, model_id, user_id, **kwargs):
    steps = 2000
    dataset = DatasetModel.query().get(dataset_id)
    csv_file = StringIO(dataset.data)
    training_set = load_dataset(csv_file)
    model_attrs = {}
    if "dimension" in kwargs:
        model_attrs["feature_columns"] = [
            real_valued_column("", dimension=kwargs['dimension'])
        ]
    if "hidden_units" in kwargs:
        model_attrs['hidden_units'] = map(int, kwargs['hidden_units'])
    if "n_classes" in kwargs:
        model_attrs['n_classes'] = kwargs['n_classes']
    model_db = MLMethod.query().get(model_id)
    model = loads(model_db.data)
    temp_folder = mkdtemp()
    fd, filepath = mkstemp(suffix="tar.gz")
    try:
        classifier = model(model_dir=temp_folder, **model_attrs)
        classifier = classifier.fit(x=training_set.data,
                                    y=training_set.target,
                                    steps=steps)
        tar = tarfile.open(filepath, "w:gz")
        tar.add(temp_folder, arcname="model")
        tar.close()
        with open(filepath, "rb") as fout:
            trained = MLMethod(
                user_id=user_id,
                name=model_db.name,
                description="%s model trained on %s" % (model_db.name, dataset.name),
                public=model_db.public,
                trained=True,
                data=dumps((CustomerPredictTask, model, kwargs, fout.read()),
                           HIGHEST_PROTOCOL))
            trained.save_object()
            MethodKwargs(
                model_id=trained.id,
                name="file",
                label="Data file",
                description="Provide either a data file or text data.",
                required=False,
                type="file",
            ).save_object()
            MethodKwargs(
                model_id=trained.id,
                name="data",
                label="Text data",
                description="Provide either a data file or text data.",
                required=False,
                type="str",
            ).save_object()
    finally:
        # Deleting this file raises an error, so just leave it for now.
        pass
        # os.unlink(filepath)
        # rmtree(temp_folder)
        # MethodKwargs(model_id)
    return trained.id
def _add_bias_column(feature_columns, columns_to_tensors, bias_variable,
                     targets, columns_to_variables):
  # TODO(b/31008490): Move definition to a common constants place.
  bias_column_name = "tf_virtual_bias_column"
  # Compare strings with ==, not identity.
  if any(col.name == bias_column_name for col in feature_columns):
    raise ValueError("%s is a reserved column name." % bias_column_name)
  bias_column = layers.real_valued_column(bias_column_name)
  columns_to_tensors[bias_column] = array_ops.ones_like(targets,
                                                        dtype=dtypes.float32)
  columns_to_variables[bias_column] = [bias_variable]
def get_feature_column(mode):
    feature_columns = []
    feature_columns.append(layers.real_valued_column(
        column_name='res', dimension=TEXT_FEATURE_SIZE, dtype=tf.int64))
    feature_columns.append(layers.real_valued_column(
        column_name='res_len', dimension=1, dtype=tf.int64))
    feature_columns.append(layers.real_valued_column(
        column_name='utters', dimension=TEXT_FEATURE_SIZE * TURN_FEATURE_SIZE,
        dtype=tf.int64))
    feature_columns.append(layers.real_valued_column(
        column_name='utters_len', dimension=TURN_FEATURE_SIZE, dtype=tf.int64))
    if mode == learn.ModeKeys.TRAIN:
        feature_columns.append(layers.real_valued_column(
            column_name='label', dimension=1, dtype=tf.int64))
    elif mode == learn.ModeKeys.EVAL:
        for i in xrange(DISTRACTOR_COUNT):
            feature_columns.append(layers.real_valued_column(
                column_name='distractor_{}'.format(i),
                dimension=TEXT_FEATURE_SIZE, dtype=tf.int64))
            feature_columns.append(layers.real_valued_column(
                column_name='distractor_{}_len'.format(i), dimension=1,
                dtype=tf.int64))
    # print('feature_columns=%s' % (feature_columns))
    return set(feature_columns)
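# Hedged sketch of one way the column set above can be consumed:
# contrib.layers can turn feature columns into a parsing spec for
# tf.parse_example. serialized_examples is a hypothetical batch of
# serialized tf.Example protos.
feature_columns = get_feature_column(learn.ModeKeys.TRAIN)
feature_spec = layers.create_feature_spec_for_parsing(feature_columns)
features = tf.parse_example(serialized_examples, feature_spec)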
def get_wide_deep():
    # Define column types.
    StyleName, quantity, demand, org_ret_price, sell_price, margin, \
        off_orig_retail, total_ots = [
            tflayers.sparse_column_with_hash_bucket('Style_Name',
                                                    hash_bucket_size=1000),
            tflayers.real_valued_column('Quantity'),
            tflayers.real_valued_column('Demand'),
            tflayers.real_valued_column('Original_Retail_Price'),
            tflayers.real_valued_column('Selling_Price'),
            tflayers.real_valued_column('Margin'),
            tflayers.real_valued_column('off_Orig_Retail'),
            tflayers.real_valued_column('Total_OTS'),
        ]
    # Which columns are wide (sparse, linear relationship to output) and
    # which are deep (complex relationship to output)?
    wide = [StyleName, quantity, demand]
    deep = [
        org_ret_price,
        sell_price,
        margin,
        off_orig_retail,
        total_ots,
        tflayers.embedding_column(StyleName, 3)
    ]
    return wide, deep
def get_features(): # Using three basic inputs real = { colname : tflayers.real_valued_column(colname) \ for colname in \ ('dep_delay,taxiout,distance').split(',') } sparse = {} return real, sparse
def get_feature_columns(self): """Get a list of feature column names.""" num_feature = self._num_pair * 2 x1_col = ['xorpair_{}.idx_{}'.format(i, i) for i in range(self._num_pair)] x2_col = [ 'xorpair_{}.idx_{}'.format(i - self._num_pair, i) for i in range(self._num_pair, num_feature) ] return [contrib_layers.real_valued_column(fc) for fc in x1_col + x2_col]
def get_features_ch7(): """Using only the three inputs we originally used in Chapter 7""" real = { colname : tflayers.real_valued_column(colname) \ for colname in \ ('dep_delay,taxiout,distance').split(',') } sparse = {} return real, sparse
def get_features_ch8(): """Using the three inputs we originally used in Chapter 7, plus the time averages computed in Chapter 8""" real = { colname : tflayers.real_valued_column(colname) \ for colname in \ ('dep_delay,taxiout,distance,avg_dep_delay,avg_arr_delay').split(',') } sparse = {} return real, sparse
def testInvalidKernelMapper(self): """ValueError raised when the kernel mappers provided have invalid type.""" class DummyKernelMapper(object): def __init__(self): pass feature = layers.real_valued_column('feature') kernel_mappers = {feature: [DummyKernelMapper()]} with self.assertRaises(ValueError): _ = kernel_estimators.KernelLinearClassifier( feature_columns=[feature], kernel_mappers=kernel_mappers)
def get_classifier(): # (kernel_size * kernel_size, 3) feature_columns = [layers.real_valued_column("", dimension=3)] return DNNClassifier(feature_columns=feature_columns, hidden_units=[256, 128], n_classes=5, model_dir="saved_model", # optimizer=AdadeltaOptimizer(learning_rate=0.1) # optimizer=AdamOptimizer() # dropout=0.5 )
def _dnn_feature_columns(feature_columns):
    """Generate DNN feature columns."""
    dnn_columns = []
    for col in feature_columns:
        if isinstance(col, _SparseColumnKeys):
            # one_hot_column expects the sparse column itself, not a
            # real-valued wrapper around it.
            dnn_columns.append(one_hot_column(col))
        else:
            dnn_columns.append(real_valued_column(col, dtype=tf.float64))
    return dnn_columns
def get_feature_columns(self): """Get a list of feature column names.""" out = [] count, group = 0, 0 for order in self._orders: for group_idx in range(self._num_group_per_order): for _ in range(order): out.append('mult_group_{}.idx_{}.order_{}.group_coef_{:.3}'.format( group, count, order, self._group_coefficients_by_order[order][group_idx])) count += 1 group += 1 return [contrib_layers.real_valued_column(fc) for fc in out]
def testExtractFeaturesWithTransformation(self): """Tests feature extraction.""" with self.test_session(): features = {} features["dense_float"] = array_ops.zeros([2, 1], dtypes.float32) features["sparse_float"] = sparse_tensor.SparseTensor( array_ops.zeros([2, 2], dtypes.int64), array_ops.zeros([2], dtypes.float32), array_ops.zeros([2], dtypes.int64)) features["sparse_categorical"] = sparse_tensor.SparseTensor( array_ops.zeros([2, 2], dtypes.int64), array_ops.zeros([2], dtypes.string), array_ops.zeros([2], dtypes.int64)) feature_columns = set() feature_columns.add(layers.real_valued_column("dense_float")) feature_columns.add( layers.feature_column._real_valued_var_len_column( "sparse_float", is_sparse=True)) feature_columns.add( feature_column_lib.sparse_column_with_hash_bucket( "sparse_categorical", hash_bucket_size=1000000)) (fc_names, dense_floats, sparse_float_indices, sparse_float_values, sparse_float_shapes, sparse_int_indices, sparse_int_values, sparse_int_shapes) = (gbdt_batch.extract_features( features, feature_columns)) self.assertEqual(len(fc_names), 3) self.assertAllEqual( fc_names, ["dense_float", "sparse_float", "sparse_categorical"]) self.assertEqual(len(dense_floats), 1) self.assertEqual(len(sparse_float_indices), 1) self.assertEqual(len(sparse_float_values), 1) self.assertEqual(len(sparse_float_shapes), 1) self.assertEqual(len(sparse_int_indices), 1) self.assertEqual(len(sparse_int_values), 1) self.assertEqual(len(sparse_int_shapes), 1) self.assertAllEqual(dense_floats[0].eval(), features["dense_float"].eval()) self.assertAllEqual(sparse_float_indices[0].eval(), features["sparse_float"].indices.eval()) self.assertAllEqual(sparse_float_values[0].eval(), features["sparse_float"].values.eval()) self.assertAllEqual(sparse_float_shapes[0].eval(), features["sparse_float"].dense_shape.eval()) self.assertAllEqual(sparse_int_indices[0].eval(), features["sparse_categorical"].indices.eval()) self.assertAllEqual(sparse_int_values[0].eval(), [397263, 397263]) self.assertAllEqual( sparse_int_shapes[0].eval(), features["sparse_categorical"].dense_shape.eval())
def get_features_raw(): real = { colname : tflayers.real_valued_column(colname) \ for colname in \ ('dep_delay,taxiout,distance,avg_dep_delay,avg_arr_delay' + ',dep_lat,dep_lon,arr_lat,arr_lon').split(',') } sparse = { 'carrier': tflayers.sparse_column_with_keys('carrier', keys='AS,VX,F9,UA,US,WN,HA,EV,MQ,DL,OO,B6,NK,AA'.split(',')), 'origin' : tflayers.sparse_column_with_hash_bucket('origin', hash_bucket_size=1000), # FIXME 'dest' : tflayers.sparse_column_with_hash_bucket('dest', hash_bucket_size=1000) #FIXME } return real, sparse
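# Hedged sketch of combining the dicts above into wide/deep column lists;
# the embedding dimension and the wide/deep split are illustrative guesses,
# not the original feature engineering.
def get_wide_deep_from_raw():
    real, sparse = get_features_raw()
    wide = list(sparse.values())
    deep = list(real.values()) + [
        tflayers.embedding_column(col, 10) for col in sparse.values()
    ]
    return wide, deep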
def contrib_learn_classifier_test(): """Test tf.contrib.learn.DNN_classifier.""" language_column = layers.sparse_column_with_hash_bucket( "language", hash_bucket_size=20) feature_columns = [ layers.embedding_column(language_column, dimension=3), layers.real_valued_column("age", dtype=tf.int64) ] classifier = learn.DNNClassifier( n_classes=3, feature_columns=feature_columns, hidden_units=[100, 100], config=learn.RunConfig(tf_random_seed=1, model_dir="../model_saver/estimators/" "DNN_classifier_01"), # optimizer=optimizer_exp_decay ) classifier.fit(input_fn=_input_fn, steps=10000) print("variables_names:\n", str(classifier.get_variable_names())) # scores = classifier.evaluate(input_fn=_input_fn, # steps=100) # print("scores:\n", str(scores)) scores = classifier.evaluate( input_fn=_input_fn, steps=100, metrics={ 'my_accuracy': MetricSpec(metric_fn=metrics.streaming_accuracy, prediction_key="classes"), 'my_precision': MetricSpec(metric_fn=metrics.streaming_precision, prediction_key="classes"), 'my_recall': MetricSpec(metric_fn=metrics.streaming_recall, prediction_key="classes"), 'my_metric': MetricSpec(metric_fn=my_metric_op, prediction_key="classes") }) print("scores:\n", str(scores)) predictions = classifier.predict(input_fn=_input_fn, outputs=["classes", "probabilities"]) print("predictions") for prediction in predictions: print(prediction)
def part4(): global boston, x_data, y_data import pandas as pd import numpy as np N = 10000 weight = np.random.randn(N) * 5 + 70 spec_id = np.random.randint(0, 3, N) bias = [0.9, 1, 1.1] height = np.array( [weight[i] / 100 + bias[b] for i, b in enumerate(spec_id)]) spec_name = ['Goblin', 'Human', 'ManBear'] spec = [spec_name[s] for s in spec_id] df = pd.DataFrame({'Species': spec, 'Weight': weight, 'Height': height}) from tensorflow.contrib import layers Weight = layers.real_valued_column("Weight") Species = layers.sparse_column_with_keys(column_name="Species", keys=spec_name) reg = learn.LinearRegressor(feature_columns=[Weight, Species]) def input_fn(df): feature_cols = {} feature_cols['Weight'] = tf.constant(df['Weight'].values) feature_cols['Species'] = tf.SparseTensor( indices=[[i, 0] for i in range(df['Species'].size)], values=df['Species'].values, dense_shape=[df['Species'].size, 1]) labels = tf.constant(df['Height'].values) return feature_cols, labels reg.fit(input_fn=lambda: input_fn(df), steps=50000) w_w = reg.get_variable_value('linear/Weight/weight') print(f"Estimation for Weight: {w_w}") v = reg.get_variable_names() print(f"Classes: {v}") s_w = reg.get_variable_value('linear/Species/weights') b = reg.get_variable_value('linear/bias_weight') print(f"Estimation for Species: {s_w + b}")
def _maybe_add_bias_column(feature_columns, columns_to_tensors, bias_variable,
                           targets, enable_centered_bias,
                           columns_to_variables):
  train_feature_columns = list(feature_columns)  # Make a copy.
  if enable_centered_bias:
    # Adding a bias column.
    # TODO(b/31008490): Move definition to a common constants place.
    bias_column_name = "tf_virtual_bias_column"
    # Compare strings with ==, not identity.
    if any(col.name == bias_column_name for col in feature_columns):
      raise ValueError("%s is a reserved column name." % bias_column_name)
    bias_column = layers.real_valued_column(bias_column_name)
    columns_to_tensors[bias_column] = array_ops.ones_like(targets,
                                                          dtype=dtypes.float32)
    columns_to_variables[bias_column] = [bias_variable]
    train_feature_columns.append(bias_column)
  return train_feature_columns
def testClassifierWithAndWithoutKernelsNoRealValuedColumns(self): """Tests kernels have no effect for non-real valued columns .""" def input_fn(): return { 'price': constant_op.constant([[0.4], [0.6], [0.3]]), 'country': sparse_tensor.SparseTensor(values=['IT', 'US', 'GB'], indices=[[0, 0], [1, 3], [2, 1]], dense_shape=[3, 5]), }, constant_op.constant([[1], [0], [1]]) price = layers.real_valued_column('price') country = layers.sparse_column_with_hash_bucket('country', hash_bucket_size=5) linear_classifier = kernel_estimators.KernelLinearClassifier( feature_columns=[price, country]) linear_classifier.fit(input_fn=input_fn, steps=100) linear_metrics = linear_classifier.evaluate(input_fn=input_fn, steps=1) linear_loss = linear_metrics['loss'] linear_accuracy = linear_metrics['accuracy'] kernel_mappers = { country: [RandomFourierFeatureMapper(2, 30, 0.6, 1, 'rffm')] } kernel_linear_classifier = kernel_estimators.KernelLinearClassifier( feature_columns=[price, country], kernel_mappers=kernel_mappers) kernel_linear_classifier.fit(input_fn=input_fn, steps=100) kernel_linear_metrics = kernel_linear_classifier.evaluate( input_fn=input_fn, steps=1) kernel_linear_loss = kernel_linear_metrics['loss'] kernel_linear_accuracy = kernel_linear_metrics['accuracy'] # The kernel mapping is applied to a non-real-valued feature column and so # it should have no effect on the model. The loss and accuracy of the # "kernelized" model should match the loss and accuracy of the initial model # (without kernels). self.assertAlmostEqual(linear_loss, kernel_linear_loss, delta=0.01) self.assertAlmostEqual(linear_accuracy, kernel_linear_accuracy, delta=0.01)
def testLinearlyInseparableBinaryDataWithAndWithoutKernels(self): """Tests classifier w/ and w/o kernels on non-linearly-separable data.""" multi_dim_feature = layers.real_valued_column('multi_dim_feature', dimension=2) # Data points are non-linearly separable so there will be at least one # mis-classified sample (accuracy < 0.8). In fact, the loss is minimized for # w1=w2=0.0, in which case each example incurs a loss of ln(2). The overall # (average) loss should then be ln(2) and the logits should be approximately # 0.0 for each sample. logreg_classifier = kernel_estimators.KernelLinearClassifier( feature_columns=[multi_dim_feature]) logreg_classifier.fit(input_fn=_linearly_inseparable_binary_input_fn, steps=50) logreg_metrics = logreg_classifier.evaluate( input_fn=_linearly_inseparable_binary_input_fn, steps=1) logreg_loss = logreg_metrics['loss'] logreg_accuracy = logreg_metrics['accuracy'] logreg_predictions = logreg_classifier.predict( input_fn=_linearly_inseparable_binary_input_fn, as_iterable=False) self.assertAlmostEqual(logreg_loss, np.log(2), places=3) self.assertLess(logreg_accuracy, 0.8) self.assertAllClose(logreg_predictions['logits'], [[0.0], [0.0], [0.0], [0.0]]) # Using kernel mappers allows to discover non-linearities in data. Mapping # the data to a higher dimensional feature space using approx RBF kernels, # substantially reduces the loss and leads to perfect classification # accuracy. kernel_mappers = { multi_dim_feature: [RandomFourierFeatureMapper(2, 30, 0.6, 1, 'rffm')] } kernelized_logreg_classifier = kernel_estimators.KernelLinearClassifier( feature_columns=[], kernel_mappers=kernel_mappers) kernelized_logreg_classifier.fit( input_fn=_linearly_inseparable_binary_input_fn, steps=50) kernelized_logreg_metrics = kernelized_logreg_classifier.evaluate( input_fn=_linearly_inseparable_binary_input_fn, steps=1) kernelized_logreg_loss = kernelized_logreg_metrics['loss'] kernelized_logreg_accuracy = kernelized_logreg_metrics['accuracy'] self.assertLess(kernelized_logreg_loss, 0.2) self.assertEqual(kernelized_logreg_accuracy, 1.0)
def DNNClassifierTrainTask(self, datasource, train_path, test_path, **kwargs):
    steps = kwargs.pop("steps", 2000)
    if datasource == 'system':
        # Data from the system.
        training_set = load_system_dataset(train_path)
        if test_path:
            test_set = load_system_dataset(test_path)
        feature_columns = [real_valued_column("", dimension=4)]
        classifier = DNNClassifier(feature_columns=feature_columns,
                                   **kwargs
                                   # hidden_units=[10, 20, 10],
                                   # n_classes=3
                                   )
        # Train first; report accuracy only when a test set is available.
        classifier.fit(x=training_set.data, y=training_set.target, steps=steps)
        if test_path:
            accuracy_score = classifier.evaluate(
                x=test_set.data, y=test_set.target)["accuracy"]
            return accuracy_score
def testVariablesWithAndWithoutKernels(self): """Tests variables w/ and w/o kernel.""" multi_dim_feature = layers.real_valued_column('multi_dim_feature', dimension=2) linear_classifier = kernel_estimators.KernelLinearClassifier( feature_columns=[multi_dim_feature]) linear_classifier.fit(input_fn=_linearly_inseparable_binary_input_fn, steps=50) linear_variables = linear_classifier.get_variable_names() self.assertIn('linear/multi_dim_feature/weight', linear_variables) self.assertIn('linear/bias_weight', linear_variables) linear_weights = linear_classifier.get_variable_value( 'linear/multi_dim_feature/weight') linear_bias = linear_classifier.get_variable_value( 'linear/bias_weight') kernel_mappers = { multi_dim_feature: [RandomFourierFeatureMapper(2, 30, 0.6, 1, 'rffm')] } kernel_linear_classifier = kernel_estimators.KernelLinearClassifier( feature_columns=[], kernel_mappers=kernel_mappers) kernel_linear_classifier.fit( input_fn=_linearly_inseparable_binary_input_fn, steps=50) kernel_linear_variables = kernel_linear_classifier.get_variable_names() self.assertIn('linear/multi_dim_feature_MAPPED/weight', kernel_linear_variables) self.assertIn('linear/bias_weight', kernel_linear_variables) kernel_linear_weights = kernel_linear_classifier.get_variable_value( 'linear/multi_dim_feature_MAPPED/weight') kernel_linear_bias = kernel_linear_classifier.get_variable_value( 'linear/bias_weight') # The feature column used for linear classification (no kernels) has # dimension 2 so the model will learn a 2-dimension weights vector (and a # scalar for the bias). In the kernelized model, the features are mapped to # a 30-dimensional feature space and so the weights variable will also have # dimension 30. self.assertEqual(2, len(linear_weights)) self.assertEqual(1, len(linear_bias)) self.assertEqual(30, len(kernel_linear_weights)) self.assertEqual(1, len(kernel_linear_bias))
def test_savedmodel_state_override(self): random_model = RandomStateSpaceModel( state_dimension=5, state_noise_dimension=4, configuration=state_space_model.StateSpaceModelConfiguration( exogenous_feature_columns=[layers.real_valued_column("exogenous")], dtype=dtypes.float64, num_features=1)) estimator = estimators.StateSpaceRegressor( model=random_model, optimizer=gradient_descent.GradientDescentOptimizer(0.1)) combined_input_fn = input_pipeline.WholeDatasetInputFn( input_pipeline.NumpyReader({ feature_keys.FilteringFeatures.TIMES: [1, 2, 3, 4], feature_keys.FilteringFeatures.VALUES: [1., 2., 3., 4.], "exogenous": [-1., -2., -3., -4.] })) estimator.train(combined_input_fn, steps=1) export_location = estimator.export_savedmodel( self.get_temp_dir(), estimator.build_raw_serving_input_receiver_fn()) with ops.Graph().as_default() as graph: random_model.initialize_graph() with self.session(graph=graph) as session: variables.global_variables_initializer().run() evaled_start_state = session.run(random_model.get_start_state()) evaled_start_state = [ state_element[None, ...] for state_element in evaled_start_state] with ops.Graph().as_default() as graph: with self.session(graph=graph) as session: signatures = loader.load( session, [tag_constants.SERVING], export_location) first_split_filtering = saved_model_utils.filter_continuation( continue_from={ feature_keys.FilteringResults.STATE_TUPLE: evaled_start_state}, signatures=signatures, session=session, features={ feature_keys.FilteringFeatures.TIMES: [1, 2], feature_keys.FilteringFeatures.VALUES: [1., 2.], "exogenous": [[-1.], [-2.]]}) second_split_filtering = saved_model_utils.filter_continuation( continue_from=first_split_filtering, signatures=signatures, session=session, features={ feature_keys.FilteringFeatures.TIMES: [3, 4], feature_keys.FilteringFeatures.VALUES: [3., 4.], "exogenous": [[-3.], [-4.]] }) combined_filtering = saved_model_utils.filter_continuation( continue_from={ feature_keys.FilteringResults.STATE_TUPLE: evaled_start_state}, signatures=signatures, session=session, features={ feature_keys.FilteringFeatures.TIMES: [1, 2, 3, 4], feature_keys.FilteringFeatures.VALUES: [1., 2., 3., 4.], "exogenous": [[-1.], [-2.], [-3.], [-4.]] }) split_predict = saved_model_utils.predict_continuation( continue_from=second_split_filtering, signatures=signatures, session=session, steps=1, exogenous_features={ "exogenous": [[[-5.]]]}) combined_predict = saved_model_utils.predict_continuation( continue_from=combined_filtering, signatures=signatures, session=session, steps=1, exogenous_features={ "exogenous": [[[-5.]]]}) for state_key, combined_state_value in combined_filtering.items(): if state_key == feature_keys.FilteringResults.TIMES: continue self.assertAllClose( combined_state_value, second_split_filtering[state_key]) for prediction_key, combined_value in combined_predict.items(): self.assertAllClose(combined_value, split_predict[prediction_key])
def _dnn_tree_combined_model_fn(
    features,
    labels,
    mode,
    head,
    dnn_hidden_units,
    dnn_feature_columns,
    tree_learner_config,
    num_trees,
    tree_examples_per_layer,
    config=None,
    dnn_optimizer="Adagrad",
    dnn_activation_fn=nn.relu,
    dnn_dropout=None,
    dnn_input_layer_partitioner=None,
    dnn_input_layer_to_tree=True,
    dnn_steps_to_train=10000,
    predict_with_tree_only=False,
    tree_feature_columns=None,
    tree_center_bias=False,
    dnn_to_tree_distillation_param=None,
    use_core_versions=False,
    output_type=model.ModelBuilderOutputType.MODEL_FN_OPS):
  """DNN and GBDT combined model_fn.

  Args:
    features: `dict` of `Tensor` objects.
    labels: Labels used to train on.
    mode: Mode we are in. (TRAIN/EVAL/INFER)
    head: A `Head` instance.
    dnn_hidden_units: List of hidden units per layer.
    dnn_feature_columns: An iterable containing all the feature columns
      used by the model's DNN.
    tree_learner_config: A config for the tree learner.
    num_trees: Number of trees to grow model to after training DNN.
    tree_examples_per_layer: Number of examples to accumulate before
      growing the tree a layer. This value has a big impact on model quality
      and should be set equal to the number of examples in training dataset
      if possible. It can also be a function that computes the number of
      examples based on the depth of the layer that's being built.
    config: `RunConfig` of the estimator.
    dnn_optimizer: string, `Optimizer` object, or callable that defines the
      optimizer to use for training the DNN. If `None`, will use the Adagrad
      optimizer with default learning rate of 0.001.
    dnn_activation_fn: Activation function applied to each layer of the DNN.
      If `None`, will use `tf.nn.relu`.
    dnn_dropout: When not `None`, the probability to drop out a given
      unit in the DNN.
    dnn_input_layer_partitioner: Partitioner for input layer of the DNN.
      Defaults to `min_max_variable_partitioner` with `min_slice_size`
      64 << 20.
    dnn_input_layer_to_tree: Whether to provide the DNN's input layer
      as a feature to the tree.
    dnn_steps_to_train: Number of steps to train dnn for before switching
      to gbdt.
    predict_with_tree_only: Whether to use only the tree model output as the
      final prediction.
    tree_feature_columns: An iterable containing all the feature columns
      used by the model's boosted trees. If dnn_input_layer_to_tree is
      set to True, these features are in addition to dnn_feature_columns.
    tree_center_bias: Whether a separate tree should be created for
      first fitting the bias.
    dnn_to_tree_distillation_param: A Tuple of (float, loss_fn), where the
      float defines the weight of the distillation loss, and the loss_fn, for
      computing distillation loss, takes dnn_logits, tree_logits and weight
      tensor. If the entire tuple is None, no distillation will be applied. If
      only the loss_fn is None, we will take the sigmoid/softmax cross entropy
      loss by default. When distillation is applied,
      `predict_with_tree_only` will be set to True.
    use_core_versions: Whether feature columns and loss are from the core (as
      opposed to contrib) version of tensorflow.

  Returns:
    A `ModelFnOps` object.

  Raises:
    ValueError: if inputs are not valid.
  """
  if not isinstance(features, dict):
    raise ValueError("features should be a dictionary of `Tensor`s. "
                     "Given type: {}".format(type(features)))

  if not dnn_feature_columns:
    raise ValueError("dnn_feature_columns must be specified")

  if dnn_to_tree_distillation_param:
    if not predict_with_tree_only:
      logging.warning("Updating predict_with_tree_only to True since "
                      "distillation is specified.")
      predict_with_tree_only = True

  # Build DNN Logits.
dnn_parent_scope = "dnn" dnn_partitioner = dnn_input_layer_partitioner or ( partitioned_variables.min_max_variable_partitioner( max_partitions=config.num_ps_replicas, min_slice_size=64 << 20)) if (output_type == model.ModelBuilderOutputType.ESTIMATOR_SPEC and not use_core_versions): raise ValueError("You must use core versions with Estimator Spec") with variable_scope.variable_scope( dnn_parent_scope, values=tuple(six.itervalues(features)), partitioner=dnn_partitioner): with variable_scope.variable_scope( "input_from_feature_columns", values=tuple(six.itervalues(features)), partitioner=dnn_partitioner) as input_layer_scope: if use_core_versions: input_layer = feature_column_lib.input_layer( features=features, feature_columns=dnn_feature_columns, weight_collections=[dnn_parent_scope]) else: input_layer = layers.input_from_feature_columns( columns_to_tensors=features, feature_columns=dnn_feature_columns, weight_collections=[dnn_parent_scope], scope=input_layer_scope) previous_layer = input_layer for layer_id, num_hidden_units in enumerate(dnn_hidden_units): with variable_scope.variable_scope( "hiddenlayer_%d" % layer_id, values=(previous_layer,)) as hidden_layer_scope: net = layers.fully_connected( previous_layer, num_hidden_units, activation_fn=dnn_activation_fn, variables_collections=[dnn_parent_scope], scope=hidden_layer_scope) if dnn_dropout is not None and mode == model_fn.ModeKeys.TRAIN: net = layers.dropout(net, keep_prob=(1.0 - dnn_dropout)) _add_hidden_layer_summary(net, hidden_layer_scope.name) previous_layer = net with variable_scope.variable_scope( "logits", values=(previous_layer,)) as logits_scope: dnn_logits = layers.fully_connected( previous_layer, head.logits_dimension, activation_fn=None, variables_collections=[dnn_parent_scope], scope=logits_scope) _add_hidden_layer_summary(dnn_logits, logits_scope.name) def _dnn_train_op_fn(loss): """Returns the op to optimize the loss.""" return optimizers.optimize_loss( loss=loss, global_step=training_util.get_global_step(), learning_rate=_DNN_LEARNING_RATE, optimizer=_get_optimizer(dnn_optimizer), name=dnn_parent_scope, variables=ops.get_collection( ops.GraphKeys.TRAINABLE_VARIABLES, scope=dnn_parent_scope), # Empty summaries to prevent optimizers from logging training_loss. summaries=[]) # Build Tree Logits. global_step = training_util.get_global_step() with ops.device(global_step.device): ensemble_handle = model_ops.tree_ensemble_variable( stamp_token=0, tree_ensemble_config="", # Initialize an empty ensemble. 
name="ensemble_model") tree_features = features.copy() if dnn_input_layer_to_tree: tree_features["dnn_input_layer"] = input_layer tree_feature_columns.append(layers.real_valued_column("dnn_input_layer")) gbdt_model = gbdt_batch.GradientBoostedDecisionTreeModel( is_chief=config.is_chief, num_ps_replicas=config.num_ps_replicas, ensemble_handle=ensemble_handle, center_bias=tree_center_bias, examples_per_layer=tree_examples_per_layer, learner_config=tree_learner_config, feature_columns=tree_feature_columns, logits_dimension=head.logits_dimension, features=tree_features, use_core_columns=use_core_versions) with ops.name_scope("gbdt"): predictions_dict = gbdt_model.predict(mode) tree_logits = predictions_dict["predictions"] def _tree_train_op_fn(loss): """Returns the op to optimize the loss.""" if dnn_to_tree_distillation_param: loss_weight, loss_fn = dnn_to_tree_distillation_param weight_tensor = head_lib._weight_tensor( # pylint: disable=protected-access features, head.weight_column_name) dnn_logits_fixed = array_ops.stop_gradient(dnn_logits) if loss_fn is None: # we create the loss_fn similar to the head loss_fn for # multi_class_head used previously as the default one. n_classes = 2 if head.logits_dimension == 1 else head.logits_dimension loss_fn = distillation_loss.create_dnn_to_tree_cross_entropy_loss_fn( n_classes) dnn_to_tree_distillation_loss = loss_weight * loss_fn( dnn_logits_fixed, tree_logits, weight_tensor) summary.scalar("dnn_to_tree_distillation_loss", dnn_to_tree_distillation_loss) loss += dnn_to_tree_distillation_loss update_op = gbdt_model.train(loss, predictions_dict, labels) with ops.control_dependencies( [update_op]), (ops.colocate_with(global_step)): update_op = state_ops.assign_add(global_step, 1).op return update_op if predict_with_tree_only: if mode == model_fn.ModeKeys.TRAIN or mode == model_fn.ModeKeys.INFER: tree_train_logits = tree_logits else: tree_train_logits = control_flow_ops.cond( global_step > dnn_steps_to_train, lambda: tree_logits, lambda: dnn_logits) else: tree_train_logits = dnn_logits + tree_logits def _no_train_op_fn(loss): """Returns a no-op.""" del loss return control_flow_ops.no_op() if tree_center_bias: num_trees += 1 finalized_trees, attempted_trees = gbdt_model.get_number_of_trees_tensor() if output_type == model.ModelBuilderOutputType.MODEL_FN_OPS: if use_core_versions: model_fn_ops = head.create_estimator_spec( features=features, mode=mode, labels=labels, train_op_fn=_no_train_op_fn, logits=tree_train_logits) dnn_train_op = head.create_estimator_spec( features=features, mode=mode, labels=labels, train_op_fn=_dnn_train_op_fn, logits=dnn_logits) dnn_train_op = estimator_utils.estimator_spec_to_model_fn_ops( dnn_train_op).train_op tree_train_op = head.create_estimator_spec( features=tree_features, mode=mode, labels=labels, train_op_fn=_tree_train_op_fn, logits=tree_train_logits) tree_train_op = estimator_utils.estimator_spec_to_model_fn_ops( tree_train_op).train_op model_fn_ops = estimator_utils.estimator_spec_to_model_fn_ops( model_fn_ops) else: model_fn_ops = head.create_model_fn_ops( features=features, mode=mode, labels=labels, train_op_fn=_no_train_op_fn, logits=tree_train_logits) dnn_train_op = head.create_model_fn_ops( features=features, mode=mode, labels=labels, train_op_fn=_dnn_train_op_fn, logits=dnn_logits).train_op tree_train_op = head.create_model_fn_ops( features=tree_features, mode=mode, labels=labels, train_op_fn=_tree_train_op_fn, logits=tree_train_logits).train_op # Add the hooks model_fn_ops.training_hooks.extend([ 
trainer_hooks.SwitchTrainOp(dnn_train_op, dnn_steps_to_train, tree_train_op), trainer_hooks.StopAfterNTrees(num_trees, attempted_trees, finalized_trees) ]) return model_fn_ops elif output_type == model.ModelBuilderOutputType.ESTIMATOR_SPEC: fusion_spec = head.create_estimator_spec( features=features, mode=mode, labels=labels, train_op_fn=_no_train_op_fn, logits=tree_train_logits) dnn_spec = head.create_estimator_spec( features=features, mode=mode, labels=labels, train_op_fn=_dnn_train_op_fn, logits=dnn_logits) tree_spec = head.create_estimator_spec( features=tree_features, mode=mode, labels=labels, train_op_fn=_tree_train_op_fn, logits=tree_train_logits) training_hooks = [ trainer_hooks.SwitchTrainOp(dnn_spec.train_op, dnn_steps_to_train, tree_spec.train_op), trainer_hooks.StopAfterNTrees(num_trees, attempted_trees, finalized_trees) ] fusion_spec = fusion_spec._replace(training_hooks=training_hooks + list(fusion_spec.training_hooks)) return fusion_spec
from tensorflow.contrib.layers import bucketized_column, crossed_column, embedding_column, sparse_column_with_keys, sparse_column_with_hash_bucket, real_valued_column from tempfile import mkdtemp PATH_TO_DIRECTORY_OF_THIS_FILE = dirname(realpath(__file__)) PATH_TO_DIRECTORY_OF_INPUT_DATA = PATH_TO_DIRECTORY_OF_THIS_FILE + "/data/input" MODEL_DIR = PATH_TO_DIRECTORY_OF_THIS_FILE + "/classifier" CATEGORICAL_COLUMNS = ["admin_level", "country_code", "edit_distance", "has_mpoly", "has_pcode", "is_country", "is_highest_population", "is_lowest_admin_level", "matches_topic"] CONTINUOUS_COLUMNS = ["cluster_frequency", "country_rank", "median_distance", "population", "popularity"] LABEL_COLUMN = "correct" COLUMNS = sorted(CATEGORICAL_COLUMNS + CONTINUOUS_COLUMNS) + [LABEL_COLUMN] print "COLUMNS:", COLUMNS admin_level = sparse_column_with_keys(column_name="admin_level", keys=["None","0","1","2","3","4","5","6"]) # I've never seen admin 6, but you never know! cluster_frequency = real_valued_column("cluster_frequency") cluster_frequency_buckets = bucketized_column(cluster_frequency, boundaries=[0, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1]) country_code = sparse_column_with_hash_bucket("country_code", hash_bucket_size=500) country_rank = real_valued_column("country_rank") edit_distance = sparse_column_with_keys(column_name="edit_distance", keys=["0", "1", "2"]) has_pcode = sparse_column_with_keys(column_name="has_pcode", keys=["True", "False"]) has_mpoly = sparse_column_with_keys(column_name="has_mpoly", keys=["True", "False"]) is_country = sparse_column_with_keys(column_name="is_country", keys=["True", "False"]) is_lowest_admin_level = sparse_column_with_keys(column_name="is_lowest_admin_level", keys=["True", "False"]) is_highest_population = sparse_column_with_keys(column_name="is_highest_population", keys=["True", "False"]) matches_topic = sparse_column_with_keys(column_name="matches_topic", keys=["True", "False"]) median_distance = real_valued_column("median_distance") median_distance_buckets = bucketized_column(median_distance, boundaries=[10,50,100,200,300]) population = real_valued_column("population") population_buckets = bucketized_column(population, boundaries=[0, 1, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000]) popularity = real_valued_column("popularity")
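# Hedged sketch: one plausible way to feed the columns above into a wide
# (linear) model. The crossed column and the choice of bucketized variants
# are illustrative assumptions, not the original pipeline.
from tensorflow.contrib.learn import LinearClassifier

country_x_admin = crossed_column([country_code, admin_level],
                                 hash_bucket_size=int(1e4))
wide_columns = [admin_level, country_code, edit_distance, has_mpoly,
                has_pcode, is_country, is_highest_population,
                is_lowest_admin_level, matches_topic,
                cluster_frequency_buckets, median_distance_buckets,
                population_buckets, popularity, country_x_admin]
classifier = LinearClassifier(feature_columns=wide_columns,
                              model_dir=MODEL_DIR)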
def build_feature_cols(): # Sparse base columns. gender = tf.contrib.layers.sparse_column_with_keys( column_name="gender", keys=["female", "male"]) race = tf.contrib.layers.sparse_column_with_keys( column_name="race", keys=["Amer-Indian-Eskimo", "Asian-Pac-Islander", "Black", "Other", "White"]) education = tf.contrib.layers.sparse_column_with_hash_bucket( "education", hash_bucket_size=1000) marital_status = tf.contrib.layers.sparse_column_with_hash_bucket( "marital_status", hash_bucket_size=100) relationship = tf.contrib.layers.sparse_column_with_hash_bucket( "relationship", hash_bucket_size=100) workclass = tf.contrib.layers.sparse_column_with_hash_bucket( "workclass", hash_bucket_size=100) occupation = tf.contrib.layers.sparse_column_with_hash_bucket( "occupation", hash_bucket_size=1000) native_country = tf.contrib.layers.sparse_column_with_hash_bucket( "native_country", hash_bucket_size=1000) # Continuous base columns. age = real_valued_column("age") education_num = real_valued_column("education_num") capital_gain = real_valued_column("capital_gain") capital_loss = real_valued_column("capital_loss") hours_per_week = real_valued_column("hours_per_week") # Transformations. age_buckets = tf.contrib.layers.bucketized_column( age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65]) education_occupation = tf.contrib.layers.crossed_column( [education, occupation], hash_bucket_size=int(1e4)) age_race_occupation = tf.contrib.layers.crossed_column( [age_buckets, race, occupation], hash_bucket_size=int(1e6)) country_occupation = tf.contrib.layers.crossed_column( [native_country, occupation], hash_bucket_size=int(1e4)) # Wide columns and deep columns. wide_columns = [gender, native_country, education, occupation, workclass, race, marital_status, relationship, age_buckets, education_occupation, age_race_occupation, country_occupation] deep_columns = [ embedding_column(gender, dimension=8), embedding_column(native_country, dimension=8), embedding_column(education, dimension=8), embedding_column(occupation, dimension=8), embedding_column(workclass, dimension=8), embedding_column(race, dimension=8), embedding_column(marital_status, dimension=8), embedding_column(relationship, dimension=8), embedding_column(age_buckets, dimension=8), embedding_column(education_occupation, dimension=8), embedding_column(age_race_occupation, dimension=8), embedding_column(country_occupation, dimension=8), age, education_num, capital_gain, capital_loss, hours_per_week, ] return wide_columns, deep_columns
tf.logging.set_verbosity(tf.logging.INFO)

CSV_COLUMNS = 'fare_amount,dayofweek,hourofday,pickuplon,pickuplat,dropofflon,dropofflat,passengers,key'.split(',')
SCALE_COLUMNS = ['pickuplon', 'pickuplat', 'dropofflon', 'dropofflat', 'passengers']
LABEL_COLUMN = 'fare_amount'
KEY_FEATURE_COLUMN = 'key'
DEFAULTS = [[0.0], ['Sun'], [0], [-74.0], [40.0], [-74.0], [40.7], [1.0], ['nokey']]

# These are the raw input columns, and will be provided for prediction also.
INPUT_COLUMNS = [
    # Define features.
    layers.sparse_column_with_keys(
        'dayofweek', keys=['Sun', 'Mon', 'Tues', 'Wed', 'Thu', 'Fri', 'Sat']),
    layers.sparse_column_with_integerized_feature('hourofday', bucket_size=24),

    # Engineered features that are created in the input_fn.
    layers.real_valued_column('latdiff'),
    layers.real_valued_column('londiff'),
    layers.real_valued_column('euclidean'),

    # Raw coordinates and passenger count.
    layers.real_valued_column('pickuplon'),
    layers.real_valued_column('pickuplat'),
    layers.real_valued_column('dropofflat'),
    layers.real_valued_column('dropofflon'),
    layers.real_valued_column('passengers'),
]


def build_estimator(model_dir, nbuckets, hidden_units):
  """Build an estimator starting from INPUT_COLUMNS.

  These include feature transformations and synthetic features.
  """
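# Sketch of the feature engineering the docstring refers to. The helper name
# add_engineered is an assumption (the original input_fn is not shown here);
# it derives the latdiff/londiff/euclidean columns declared in INPUT_COLUMNS.
def add_engineered(features):
  latdiff = features['pickuplat'] - features['dropofflat']
  londiff = features['pickuplon'] - features['dropofflon']
  features['latdiff'] = latdiff
  features['londiff'] = londiff
  features['euclidean'] = tf.sqrt(latdiff * latdiff + londiff * londiff)
  return features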
X_train = X_train.copy()
X_test = X_test.copy()

# Encode each categorical variable to integer ids with a per-variable
# LabelEncoder, keeping the encoders around for later lookups.
categorical_var_encoders = {}
for var in categorical_vars:
  le = LabelEncoder().fit(X_train[var])
  X_train[var + '_ids'] = le.transform(X_train[var])
  X_test[var + '_ids'] = le.transform(X_test[var])
  X_train.pop(var)
  X_test.pop(var)
  categorical_var_encoders[var] = le

### Note: Feature Columns are currently (2016/10/22) not working; an update is coming.
# Set up feature columns.
CATEGORICAL_EMBED_SIZE = 10  # Note: you can customize this per variable.
feature_columns = [
    layers.real_valued_column(var) for var in continues_vars
] + [
    layers.embedding_column(
        layers.sparse_column_with_integerized_feature(
            var + '_ids', len(categorical_var_encoders[var].classes_)),
        CATEGORICAL_EMBED_SIZE)
    for var in categorical_vars
]

# Linear classifier (commented out).
'''
random.seed(42)
tflr = learn.LinearClassifier(
    n_classes=2,
    feature_columns=feature_columns,
    optimizer=tf.train.GradientDescentOptimizer(learning_rate=0.05))
'''
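# Hypothetical continuation (not in the original snippet): a constant-tensor
# input_fn over the encoded frames, fed to a DNNClassifier. y_train is assumed
# to exist alongside X_train.
def train_input_fn():
  feature_tensors = {}
  for var in continues_vars:
    feature_tensors[var] = tf.constant(X_train[var].values, dtype=tf.float32)
  for var in categorical_vars:
    feature_tensors[var + '_ids'] = tf.constant(X_train[var + '_ids'].values)
  return feature_tensors, tf.constant(y_train)

classifier = learn.DNNClassifier(
    hidden_units=[64, 32], n_classes=2, feature_columns=feature_columns)
classifier.fit(input_fn=train_input_fn, steps=200)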
# Imports needed by this excerpt (mirroring the module this model_fn lives in).
import six

from tensorflow.contrib import layers
from tensorflow.contrib.boosted_trees.estimator_batch import trainer_hooks
from tensorflow.contrib.boosted_trees.python.ops import model_ops
from tensorflow.contrib.boosted_trees.python.training.functions import gbdt_batch
from tensorflow.contrib.layers.python.layers import optimizers
from tensorflow.contrib.learn.python.learn.estimators import model_fn
from tensorflow.python.framework import ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import nn
from tensorflow.python.ops import partitioned_variables
from tensorflow.python.ops import state_ops
from tensorflow.python.ops import variable_scope
from tensorflow.python.summary import summary
from tensorflow.python.training import training_util

_DNN_LEARNING_RATE = 0.001


# Module-private helpers referenced below.
def _get_optimizer(optimizer):
  """Materializes the optimizer if it was given as a callable."""
  if callable(optimizer):
    return optimizer()
  return optimizer


def _add_hidden_layer_summary(value, tag):
  """Adds sparsity and activation summaries for a hidden layer."""
  summary.scalar("%s/fraction_of_zero_values" % tag, nn.zero_fraction(value))
  summary.histogram("%s/activation" % tag, value)


def _dnn_tree_combined_model_fn(
    features, labels, mode, head, dnn_hidden_units,
    dnn_feature_columns, tree_learner_config, num_trees,
    tree_examples_per_layer, config=None, dnn_optimizer="Adagrad",
    dnn_activation_fn=nn.relu, dnn_dropout=None,
    dnn_input_layer_partitioner=None, dnn_input_layer_to_tree=True,
    dnn_steps_to_train=10000, tree_feature_columns=None,
    tree_center_bias=True):
  """DNN and GBDT combined model_fn.

  Args:
    features: `dict` of `Tensor` objects.
    labels: Labels used to train on.
    mode: Mode we are in (TRAIN/EVAL/INFER).
    head: A `Head` instance.
    dnn_hidden_units: List of hidden units per layer.
    dnn_feature_columns: An iterable containing all the feature columns
      used by the model's DNN.
    tree_learner_config: A config for the tree learner.
    num_trees: Number of trees to grow the model to after training the DNN.
    tree_examples_per_layer: Number of examples to accumulate before
      growing the tree a layer. This value has a big impact on model
      quality and should be set equal to the number of examples in the
      training dataset if possible. It can also be a function that computes
      the number of examples based on the depth of the layer that's
      being built.
    config: `RunConfig` of the estimator.
    dnn_optimizer: string, `Optimizer` object, or callable that defines the
      optimizer to use for training the DNN. If `None`, will use the Adagrad
      optimizer with a default learning rate of 0.001.
    dnn_activation_fn: Activation function applied to each layer of the DNN.
      If `None`, will use `tf.nn.relu`.
    dnn_dropout: When not `None`, the probability to drop out a given
      unit in the DNN.
    dnn_input_layer_partitioner: Partitioner for the input layer of the DNN.
      Defaults to `min_max_variable_partitioner` with `min_slice_size`
      64 << 20.
    dnn_input_layer_to_tree: Whether to provide the DNN's input layer
      as a feature to the tree.
    dnn_steps_to_train: Number of steps to train the DNN before switching
      to the GBDT.
    tree_feature_columns: An iterable containing all the feature columns
      used by the model's boosted trees. If dnn_input_layer_to_tree is
      set to True, these features are in addition to dnn_feature_columns.
    tree_center_bias: Whether a separate tree should be created for
      first fitting the bias.

  Returns:
    A `ModelFnOps` object.

  Raises:
    ValueError: if inputs are not valid.
  """
  if not isinstance(features, dict):
    raise ValueError("features should be a dictionary of `Tensor`s. "
                     "Given type: {}".format(type(features)))

  if not dnn_feature_columns:
    raise ValueError("dnn_feature_columns must be specified")

  # Build DNN Logits.
  dnn_parent_scope = "dnn"
  dnn_partitioner = dnn_input_layer_partitioner or (
      partitioned_variables.min_max_variable_partitioner(
          max_partitions=config.num_ps_replicas, min_slice_size=64 << 20))

  with variable_scope.variable_scope(
      dnn_parent_scope,
      values=tuple(six.itervalues(features)),
      partitioner=dnn_partitioner):

    with variable_scope.variable_scope(
        "input_from_feature_columns",
        values=tuple(six.itervalues(features)),
        partitioner=dnn_partitioner) as input_layer_scope:
      input_layer = layers.input_from_feature_columns(
          columns_to_tensors=features,
          feature_columns=dnn_feature_columns,
          weight_collections=[dnn_parent_scope],
          scope=input_layer_scope)
    previous_layer = input_layer
    for layer_id, num_hidden_units in enumerate(dnn_hidden_units):
      with variable_scope.variable_scope(
          "hiddenlayer_%d" % layer_id,
          values=(previous_layer,)) as hidden_layer_scope:
        net = layers.fully_connected(
            previous_layer,
            num_hidden_units,
            activation_fn=dnn_activation_fn,
            variables_collections=[dnn_parent_scope],
            scope=hidden_layer_scope)
        if dnn_dropout is not None and mode == model_fn.ModeKeys.TRAIN:
          net = layers.dropout(net, keep_prob=(1.0 - dnn_dropout))
      _add_hidden_layer_summary(net, hidden_layer_scope.name)
      previous_layer = net
    with variable_scope.variable_scope(
        "logits", values=(previous_layer,)) as logits_scope:
      dnn_logits = layers.fully_connected(
          previous_layer,
          head.logits_dimension,
          activation_fn=None,
          variables_collections=[dnn_parent_scope],
          scope=logits_scope)
    _add_hidden_layer_summary(dnn_logits, logits_scope.name)

    def _dnn_train_op_fn(loss):
      """Returns the op to optimize the loss."""
      return optimizers.optimize_loss(
          loss=loss,
          global_step=training_util.get_global_step(),
          learning_rate=_DNN_LEARNING_RATE,
          optimizer=_get_optimizer(dnn_optimizer),
          name=dnn_parent_scope,
          variables=ops.get_collection(
              ops.GraphKeys.TRAINABLE_VARIABLES, scope=dnn_parent_scope),
          # Empty summaries to prevent optimizers from logging training_loss.
          summaries=[])

  # Build Tree Logits.
  global_step = training_util.get_global_step()
  with ops.device(global_step.device):
    ensemble_handle = model_ops.tree_ensemble_variable(
        stamp_token=0,
        tree_ensemble_config="",  # Initialize an empty ensemble.
        name="ensemble_model")

  tree_features = features.copy()
  if dnn_input_layer_to_tree:
    tree_features["dnn_input_layer"] = input_layer
    # Guard against the default tree_feature_columns=None before appending.
    tree_feature_columns = list(tree_feature_columns or [])
    tree_feature_columns.append(layers.real_valued_column("dnn_input_layer"))
  gbdt_model = gbdt_batch.GradientBoostedDecisionTreeModel(
      is_chief=config.is_chief,
      num_ps_replicas=config.num_ps_replicas,
      ensemble_handle=ensemble_handle,
      center_bias=tree_center_bias,
      examples_per_layer=tree_examples_per_layer,
      learner_config=tree_learner_config,
      feature_columns=tree_feature_columns,
      logits_dimension=head.logits_dimension,
      features=tree_features)

  with ops.name_scope("gbdt"):
    predictions_dict = gbdt_model.predict(mode)
    tree_logits = predictions_dict["predictions"]

    def _tree_train_op_fn(loss):
      """Returns the op to optimize the loss."""
      update_op = gbdt_model.train(loss, predictions_dict, labels)
      with ops.control_dependencies(
          [update_op]), (ops.colocate_with(global_step)):
        update_op = state_ops.assign_add(global_step, 1).op
        return update_op

  tree_train_logits = dnn_logits + tree_logits

  def _no_train_op_fn(loss):
    """Returns a no-op."""
    del loss
    return control_flow_ops.no_op()

  model_fn_ops = head.create_model_fn_ops(
      features=features,
      mode=mode,
      labels=labels,
      train_op_fn=_no_train_op_fn,
      logits=tree_train_logits)
  dnn_train_op = head.create_model_fn_ops(
      features=features,
      mode=mode,
      labels=labels,
      train_op_fn=_dnn_train_op_fn,
      logits=dnn_logits).train_op
  tree_train_op = head.create_model_fn_ops(
      features=tree_features,
      mode=mode,
      labels=labels,
      train_op_fn=_tree_train_op_fn,
      logits=tree_train_logits).train_op

  if tree_center_bias:
    num_trees += 1
  finalized_trees, attempted_trees = gbdt_model.get_number_of_trees_tensor()

  model_fn_ops.training_hooks.extend([
      trainer_hooks.SwitchTrainOp(dnn_train_op, dnn_steps_to_train,
                                  tree_train_op),
      trainer_hooks.StopAfterNTrees(num_trees, attempted_trees,
                                    finalized_trees)
  ])

  return model_fn_ops