# Building a First Model

# In this exercise, we will predict median_house_value, which will be our
# label (sometimes also called a target). We will use total_rooms as our
# input feature.

# This data is at the city-block level, so these features reflect the total
# number of rooms in that block, or the total number of people living in
# that block, respectively.

# To train our model, we will use the LinearRegressor interface provided by
# the TensorFlow contrib.learn library. This library takes care of a lot of
# the input/output plumbing, and provides a convenient interface for working
# with data, training, and evaluation.

# These snippets assume the following imports, and that
# california_housing_dataframe has already been loaded as a pandas DataFrame.
import math

import numpy as np
import pandas as pd
import tensorflow as tf
from IPython.display import display
from matplotlib import cm
from matplotlib import pyplot as plt
from sklearn import metrics
from tensorflow.contrib.learn.python.learn import learn_io

# First, we define the input feature and the target, and create the
# LinearRegressor object.

# GradientDescentOptimizer implements mini-batch stochastic gradient descent
# (SGD), where the size of the mini-batch is given by the batch_size
# parameter. Note the optimizer's learning_rate parameter: it controls the
# size of the gradient step. For safety, we also pass a gradient_clip_norm
# value, which helps avoid cases where the gradients grow so large that
# gradient descent produces bad results.
my_feature = california_housing_dataframe[["total_rooms"]]
targets = california_housing_dataframe["median_house_value"]

training_input_fn = learn_io.pandas_input_fn(
    x=my_feature, y=targets, num_epochs=None, batch_size=1)

feature_columns = [tf.contrib.layers.real_valued_column("total_rooms", dimension=1)]

linear_regressor = tf.contrib.learn.LinearRegressor(
    feature_columns=feature_columns,
    optimizer=tf.train.GradientDescentOptimizer(learning_rate=0.00001),
    gradient_clip_norm=5.0,
)

# Calling fit() on the feature column and targets (labels) trains the model.
_ = linear_regressor.fit(
    input_fn=training_input_fn,
    steps=100
)
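# To see how well this first model fits the training data, we can run a
# prediction pass and compute the root mean squared error (RMSE). This is a
# minimal sketch: the prediction_input_fn name and its num_epochs=1,
# shuffle=False settings simply mirror the helper functions defined later in
# this section.

# Evaluate the trained regressor on the full training set.
# num_epochs=1 and shuffle=False give exactly one ordered pass over the data.
prediction_input_fn = learn_io.pandas_input_fn(
    x=my_feature, y=targets, num_epochs=1, shuffle=False)

predictions = list(linear_regressor.predict(input_fn=prediction_input_fn))

# RMSE is on the same scale as the target, which makes it easy to interpret.
root_mean_squared_error = math.sqrt(
    metrics.mean_squared_error(targets, predictions))
print("RMSE (on training data): %0.2f" % root_mean_squared_error)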
def train_model(learning_rate, steps, batch_size, input_feature="total_rooms"):
  """Trains a linear regression model of one feature.

  Args:
    learning_rate: A `float`, the learning rate.
    steps: A non-zero `int`, the total number of training steps. A training
      step uses a single batch and consists of a forward and backward pass.
    batch_size: A non-zero `int`, the batch size.
    input_feature: A `string` specifying a column from
      `california_housing_dataframe` to use as the input feature.
  """
  periods = 10
  steps_per_period = steps // periods

  # We will build a model that predicts median_house_value from the chosen
  # input feature (total_rooms by default). To do so, we define the data
  # pipeline as follows.
  my_feature = input_feature
  my_feature_column = california_housing_dataframe[[my_feature]]
  my_label = "median_house_value"
  targets = california_housing_dataframe[my_label]

  # Create the feature column.
  feature_columns = [tf.contrib.layers.real_valued_column(my_feature, dimension=1)]

  # Create the input functions.
  # Calling fit() on the feature column and targets (labels) trains the model.
  # learn_io.pandas_input_fn feeds the training data in mini-batches.
  training_input_fn = learn_io.pandas_input_fn(
      x=my_feature_column, y=targets, num_epochs=None, batch_size=batch_size)
  prediction_input_fn = learn_io.pandas_input_fn(
      x=my_feature_column, y=targets, num_epochs=1, shuffle=False)

  # Create the linear regressor object.
  linear_regressor = tf.contrib.learn.LinearRegressor(
      feature_columns=feature_columns,
      optimizer=tf.train.GradientDescentOptimizer(learning_rate=learning_rate),
      gradient_clip_norm=5.0
  )

  # Set up to plot the state of our model's line each period.
  plt.clf()
  plt.close()
  plt.figure(figsize=(15, 6))
  plt.subplot(1, 2, 1)
  plt.title("Learned Line by Period")
  plt.ylabel(my_label)
  plt.xlabel(my_feature)
  sample = california_housing_dataframe.sample(n=300)
  plt.scatter(sample[my_feature], sample[my_label])
  colors = [cm.coolwarm(x) for x in np.linspace(-1, 1, periods)]

  # Train the model, but do so inside a loop so that we can periodically
  # assess loss metrics.
  print("Training model...")
  print("RMSE (on training data):")
  root_mean_squared_errors = []
  for period in range(0, periods):
    # Train the model, starting from the prior state.
    linear_regressor.fit(
        input_fn=training_input_fn,
        steps=steps_per_period
    )
    # Take a break and compute predictions.
    predictions = list(linear_regressor.predict(
        input_fn=prediction_input_fn))
    # Compute loss.
    root_mean_squared_error = math.sqrt(
        metrics.mean_squared_error(targets, predictions))
    # Occasionally print the current loss.
    print("  period %02d : %0.2f" % (period, root_mean_squared_error))
    # Add the loss metrics from this period to our list.
    root_mean_squared_errors.append(root_mean_squared_error)
    # Finally, track the weights and biases over time.
    # Apply some math to ensure that the data and line are plotted neatly.
    y_extents = np.array([0, sample[my_label].max()])
    x_extents = (y_extents - linear_regressor.bias_) / linear_regressor.weights_[0]
    x_extents = np.maximum(np.minimum(x_extents,
                                      sample[my_feature].max()),
                           sample[my_feature].min())
    y_extents = linear_regressor.weights_[0] * x_extents + linear_regressor.bias_
    plt.plot(x_extents, y_extents, color=colors[period])
  print("Model training finished.")

  # Output a graph of loss metrics over periods.
  plt.subplot(1, 2, 2)
  plt.ylabel('RMSE')
  plt.xlabel('Periods')
  plt.title("Root Mean Squared Error vs. Periods")
  plt.plot(root_mean_squared_errors)
  plt.show()

  # Output a table with calibration data.
  calibration_data = pd.DataFrame()
  calibration_data["predictions"] = pd.Series(predictions)
  calibration_data["targets"] = pd.Series(targets)
  display(calibration_data.describe())

  print("Final RMSE (on training data): %0.2f" % root_mean_squared_error)
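# A typical invocation might look like the following. The hyperparameter
# values are illustrative only, chosen in the same range as the hard-coded
# model above; tune them against the RMSE curve that the function plots.
train_model(
    learning_rate=0.00002,
    steps=500,
    batch_size=5
)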
def train_nn_regression_model(optimizer, steps, batch_size, hidden_units,
                              training_examples, training_targets,
                              validation_examples, validation_targets):
  """Trains a neural network regression model.

  In addition to training, this function also prints training progress
  information, as well as a plot of the training and validation loss over
  time.

  Args:
    optimizer: An instance of `tf.train.Optimizer`, the optimizer to use.
    steps: A non-zero `int`, the total number of training steps. A training
      step consists of a forward and backward pass using a single batch.
    batch_size: A non-zero `int`, the batch size.
    hidden_units: A `list` of int values, specifying the number of neurons
      in each layer.
    training_examples: A `DataFrame` containing the training features.
    training_targets: A `DataFrame` containing the training labels.
    validation_examples: A `DataFrame` containing the validation features.
    validation_targets: A `DataFrame` containing the validation labels.

  Returns:
    A tuple `(estimator, training_losses, validation_losses)`:
      estimator: the trained `DNNRegressor` object.
      training_losses: a `list` containing the training loss values taken
        during training.
      validation_losses: a `list` containing the validation loss values
        taken during training.
  """
  periods = 10
  steps_per_period = steps // periods

  # Create the input functions.
  feature_columns = set([
      tf.contrib.layers.real_valued_column(my_feature)
      for my_feature in training_examples
  ])
  training_input_fn = learn_io.pandas_input_fn(
      x=training_examples, y=training_targets,
      num_epochs=None, batch_size=batch_size)
  predict_training_input_fn = learn_io.pandas_input_fn(
      x=training_examples, y=training_targets,
      num_epochs=1, shuffle=False)
  predict_validation_input_fn = learn_io.pandas_input_fn(
      x=validation_examples, y=validation_targets,
      num_epochs=1, shuffle=False)

  # Create a DNNRegressor object.
  dnn_regressor = tf.contrib.learn.DNNRegressor(
      feature_columns=feature_columns,
      hidden_units=hidden_units,
      optimizer=optimizer,
      gradient_clip_norm=5.0)

  # Train the model, but do so inside a loop so that we can periodically
  # assess loss metrics.
  print("Training model...")
  print("RMSE (on training data):")
  training_rmse = []
  validation_rmse = []
  for period in range(0, periods):
    # Train the model, starting from the prior state.
    dnn_regressor.fit(input_fn=training_input_fn, steps=steps_per_period)
    # Take a break and compute predictions.
    training_predictions = list(
        dnn_regressor.predict(input_fn=predict_training_input_fn))
    validation_predictions = list(
        dnn_regressor.predict(input_fn=predict_validation_input_fn))
    # Compute training and validation loss.
    training_root_mean_squared_error = math.sqrt(
        metrics.mean_squared_error(training_targets, training_predictions))
    validation_root_mean_squared_error = math.sqrt(
        metrics.mean_squared_error(validation_targets, validation_predictions))
    # Occasionally print the current loss.
    print("  period %02d : %0.2f" % (period, training_root_mean_squared_error))
    # Add the loss metrics from this period to our list.
    training_rmse.append(training_root_mean_squared_error)
    validation_rmse.append(validation_root_mean_squared_error)
  print("Model training finished.")

  # Output a graph of loss metrics over periods.
  plt.ylabel("RMSE")
  plt.xlabel("Periods")
  plt.title("Root Mean Squared Error vs. Periods")
  plt.tight_layout()
  plt.plot(training_rmse, label="training")
  plt.plot(validation_rmse, label="validation")
  plt.legend()

  print("Final RMSE (on training data):   %0.2f" % training_root_mean_squared_error)
  print("Final RMSE (on validation data): %0.2f" % validation_root_mean_squared_error)
  return dnn_regressor, training_rmse, validation_rmse
def train_linear_regressor_model(learning_rate, steps, batch_size,
                                 training_examples, training_targets,
                                 validation_examples, validation_targets):
  """Trains a linear regression model.

  In addition to training, this function also prints training progress
  information, as well as a plot of the training and validation loss over
  time.

  Args:
    learning_rate: A `float`, the learning rate.
    steps: A non-zero `int`, the total number of training steps. A training
      step consists of a forward and backward pass using a single batch.
    batch_size: A non-zero `int`, the batch size.
    training_examples: A `DataFrame` containing one or more columns from
      `california_housing_dataframe` to use as input features for training.
    training_targets: A `DataFrame` containing exactly one column from
      `california_housing_dataframe` to use as target for training.
    validation_examples: A `DataFrame` containing one or more columns from
      `california_housing_dataframe` to use as input features for validation.
    validation_targets: A `DataFrame` containing exactly one column from
      `california_housing_dataframe` to use as target for validation.

  Returns:
    A `LinearRegressor` object trained on the training data.
  """
  periods = 10
  steps_per_period = steps // periods

  # Create a linear regressor object.
  feature_columns = set([
      tf.contrib.layers.real_valued_column(my_feature)
      for my_feature in training_examples
  ])
  linear_regressor = tf.contrib.learn.LinearRegressor(
      feature_columns=feature_columns,
      optimizer=tf.train.GradientDescentOptimizer(learning_rate=learning_rate),
      gradient_clip_norm=5.0)

  # Create the input functions.
  # Note: the target here is the binary column median_house_value_is_high,
  # so the RMSE reported below is computed over 0/1 labels.
  training_input_fn = learn_io.pandas_input_fn(
      x=training_examples, y=training_targets["median_house_value_is_high"],
      num_epochs=None, batch_size=batch_size)
  predict_training_input_fn = learn_io.pandas_input_fn(
      x=training_examples, y=training_targets["median_house_value_is_high"],
      num_epochs=1, shuffle=False)
  predict_validation_input_fn = learn_io.pandas_input_fn(
      x=validation_examples, y=validation_targets["median_house_value_is_high"],
      num_epochs=1, shuffle=False)

  # Train the model, but do so inside a loop so that we can periodically
  # assess loss metrics.
  print("Training model...")
  print("RMSE (on training data):")
  training_rmse = []
  validation_rmse = []
  for period in range(0, periods):
    # Train the model, starting from the prior state.
    linear_regressor.fit(input_fn=training_input_fn, steps=steps_per_period)
    # Take a break and compute predictions.
    training_predictions = list(
        linear_regressor.predict(input_fn=predict_training_input_fn))
    validation_predictions = list(
        linear_regressor.predict(input_fn=predict_validation_input_fn))
    # Compute training and validation loss.
    training_root_mean_squared_error = math.sqrt(
        metrics.mean_squared_error(training_targets, training_predictions))
    validation_root_mean_squared_error = math.sqrt(
        metrics.mean_squared_error(validation_targets, validation_predictions))
    # Occasionally print the current loss.
    print("  period %02d : %0.2f" % (period, training_root_mean_squared_error))
    # Add the loss metrics from this period to our list.
    training_rmse.append(training_root_mean_squared_error)
    validation_rmse.append(validation_root_mean_squared_error)
  print("Model training finished.")

  # Output a graph of loss metrics over periods.
  plt.ylabel("RMSE")
  plt.xlabel("Periods")
  plt.title("Root Mean Squared Error vs. Periods")
  plt.tight_layout()
  plt.plot(training_rmse, label="training")
  plt.plot(validation_rmse, label="validation")
  plt.legend()
  plt.show()

  return linear_regressor
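# As with the other helpers, an invocation would look like the following.
# The values are illustrative, and the targets DataFrames are assumed to
# contain the binary median_house_value_is_high column.
linear_regressor = train_linear_regressor_model(
    learning_rate=0.000001,  # illustrative value
    steps=200,
    batch_size=20,
    training_examples=training_examples,
    training_targets=training_targets,
    validation_examples=validation_examples,
    validation_targets=validation_targets)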
def train_linear_classifier_model(learning_rate, steps, batch_size,
                                  training_examples, training_targets,
                                  validation_examples, validation_targets):
  """Trains a linear classification model.

  In addition to training, this function also prints training progress
  information, as well as a plot of the training and validation loss over
  time.

  Args:
    learning_rate: A `float`, the learning rate.
    steps: A non-zero `int`, the total number of training steps. A training
      step consists of a forward and backward pass using a single batch.
    batch_size: A non-zero `int`, the batch size.
    training_examples: A `DataFrame` containing one or more columns from
      `dataframe` to use as input features for training.
    training_targets: A `DataFrame` containing exactly one column from
      `dataframe` to use as target for training.
    validation_examples: A `DataFrame` containing one or more columns from
      `dataframe` to use as input features for validation.
    validation_targets: A `DataFrame` containing exactly one column from
      `dataframe` to use as target for validation.

  Returns:
    A `LinearClassifier` object trained on the training data.
  """
  periods = 10
  steps_per_period = steps // periods

  # Create a linear classifier object.
  feature_columns = set([
      tf.contrib.layers.real_valued_column(my_feature)
      for my_feature in training_examples
  ])
  linear_classifier = tf.contrib.learn.LinearClassifier(
      feature_columns=feature_columns,
      optimizer=tf.train.GradientDescentOptimizer(learning_rate=learning_rate))

  # Create the input functions.
  training_input_fn = learn_io.pandas_input_fn(
      x=training_examples, y=training_targets,
      num_epochs=None, batch_size=batch_size)
  predict_training_input_fn = learn_io.pandas_input_fn(
      x=training_examples, y=training_targets,
      num_epochs=1, shuffle=False)
  predict_validation_input_fn = learn_io.pandas_input_fn(
      x=validation_examples, y=validation_targets,
      num_epochs=1, shuffle=False)

  # Train the model, but do so inside a loop so that we can periodically
  # assess loss metrics.
  print("Training model...")
  print("Log loss (on training data):")
  training_errors = []
  validation_errors = []
  for period in range(0, periods):
    # Train the model, starting from the prior state.
    linear_classifier.fit(input_fn=training_input_fn, steps=steps_per_period)
    # Take a break and compute predictions; predict_proba yields an
    # (n_examples, 2) array whose second column is P(class == 1).
    training_probabilities = np.array(
        list(linear_classifier.predict_proba(
            input_fn=predict_training_input_fn)))
    validation_probabilities = np.array(
        list(linear_classifier.predict_proba(
            input_fn=predict_validation_input_fn)))
    # Compute training and validation loss.
    training_log_loss = metrics.log_loss(training_targets,
                                         training_probabilities[:, 1])
    validation_log_loss = metrics.log_loss(validation_targets,
                                           validation_probabilities[:, 1])
    # Occasionally print the current loss.
    print("  period %02d : %0.2f" % (period, training_log_loss))
    # Add the loss metrics from this period to our list.
    training_errors.append(training_log_loss)
    validation_errors.append(validation_log_loss)
  print("Model training finished.")

  # Output a graph of loss metrics over periods.
  plt.ylabel("LogLoss")
  plt.xlabel("Periods")
  plt.title("LogLoss vs. Periods")
  plt.tight_layout()
  plt.plot(training_errors, label="training")
  plt.plot(validation_errors, label="validation")
  plt.legend()

  return linear_classifier
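# A hedged usage sketch (hyperparameters are placeholders). After training,
# AUC on the validation set can be computed from predict_proba, mirroring
# the ROC computation in train_nn_classification_model below.
linear_classifier = train_linear_classifier_model(
    learning_rate=0.000005,  # illustrative value
    steps=500,
    batch_size=20,
    training_examples=training_examples,
    training_targets=training_targets,
    validation_examples=validation_examples,
    validation_targets=validation_targets)

# Evaluate AUC on the validation set.
predict_validation_input_fn = learn_io.pandas_input_fn(
    x=validation_examples, y=validation_targets, num_epochs=1, shuffle=False)
validation_probabilities = np.array(list(
    linear_classifier.predict_proba(input_fn=predict_validation_input_fn)))
print("AUC (validation): %0.2f" % metrics.roc_auc_score(
    validation_targets, validation_probabilities[:, 1]))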
def train_nn_classification_model(learning_rate, steps, batch_size,
                                  hidden_units, training_examples,
                                  training_targets, validation_examples,
                                  validation_targets):
  """Trains a neural network classification model.

  In addition to training, this function also prints training progress
  information and the per-period log loss and ROC AUC on the training and
  validation sets.

  Args:
    learning_rate: A `float`, the learning rate to use.
    steps: A non-zero `int`, the total number of training steps. A training
      step consists of a forward and backward pass using a single batch.
    batch_size: A non-zero `int`, the batch size.
    hidden_units: A `list` of int values, specifying the number of neurons
      in each layer.
    training_examples: A `DataFrame` containing the training features.
    training_targets: A `DataFrame` containing the training labels.
    validation_examples: A `DataFrame` containing the validation features.
    validation_targets: A `DataFrame` containing the validation labels.

  Returns:
    The trained `DNNClassifier` object.
  """
  periods = 10
  steps_per_period = steps // periods

  # Create the input functions.
  feature_columns = set([
      tf.contrib.layers.real_valued_column(my_feature)
      for my_feature in training_examples
  ])
  training_input_fn = learn_io.pandas_input_fn(
      x=training_examples, y=training_targets,
      num_epochs=None, batch_size=batch_size)
  predict_training_input_fn = learn_io.pandas_input_fn(
      x=training_examples, y=training_targets,
      num_epochs=1, shuffle=False)
  predict_validation_input_fn = learn_io.pandas_input_fn(
      x=validation_examples, y=validation_targets,
      num_epochs=1, shuffle=False)

  # Create a DNNClassifier object.
  classifier = tf.contrib.learn.DNNClassifier(
      feature_columns=feature_columns,
      n_classes=2,
      hidden_units=hidden_units,
      optimizer=tf.train.AdagradOptimizer(learning_rate=learning_rate),
      gradient_clip_norm=5.0,
      config=tf.contrib.learn.RunConfig(keep_checkpoint_max=1))

  # Train the model, but do so inside a loop so that we can periodically
  # assess loss metrics.
  print("Training model...")
  print("Per period ROC (on training data):")
  training_loglossvals = []
  validation_loglossvals = []
  training_rocvals = []
  validation_rocvals = []
  for period in range(0, periods):
    # Train the model, starting from the prior state.
    classifier.fit(input_fn=training_input_fn, steps=steps_per_period)
    # Take a break and compute predictions.
    training_predictions = np.array(
        list(classifier.predict_proba(input_fn=predict_training_input_fn)))
    validation_predictions = np.array(
        list(classifier.predict_proba(input_fn=predict_validation_input_fn)))
    # Compute training and validation errors.
    training_log_loss = metrics.log_loss(training_targets,
                                         training_predictions[:, 1])
    validation_log_loss = metrics.log_loss(validation_targets,
                                           validation_predictions[:, 1])
    training_roc = metrics.roc_auc_score(training_targets,
                                         training_predictions[:, 1])
    validation_roc = metrics.roc_auc_score(validation_targets,
                                           validation_predictions[:, 1])
    # Occasionally print the current metric.
    print("  period %02d : %0.2f" % (period, training_roc))
    # Add the metrics from this period to our lists.
    training_rocvals.append(training_roc)
    validation_rocvals.append(validation_roc)
    training_loglossvals.append(training_log_loss)
    validation_loglossvals.append(validation_log_loss)

  print("LogLoss error (on training data):")
  print(training_loglossvals)
  print("LogLoss error (on validation data):")
  print(validation_loglossvals)
  print("ROC AUC (on training data):")
  print(training_rocvals)
  print("ROC AUC (on validation data):")
  print(validation_rocvals)
  print("Model training finished.")
  return classifier
def train_model(learning_rate, steps, batch_size, input_feature):
  """Trains a linear regression model.

  Args:
    learning_rate: A `float`, the learning rate.
    steps: A non-zero `int`, the total number of training steps. A training
      step consists of a forward and backward pass using a single batch.
    batch_size: A non-zero `int`, the batch size.
    input_feature: A `string` specifying a column from
      `california_housing_dataframe` to use as the input feature.

  Returns:
    A Pandas `DataFrame` containing the targets and the corresponding
    predictions made after training the model.
  """
  periods = 10
  steps_per_period = steps // periods

  my_feature = input_feature
  my_feature_column = california_housing_dataframe[[my_feature]].astype('float32')
  my_label = "median_house_value"
  targets = california_housing_dataframe[my_label].astype('float32')

  # Create the input functions.
  training_input_fn = learn_io.pandas_input_fn(
      x=my_feature_column, y=targets,
      num_epochs=None, batch_size=batch_size)
  predict_training_input_fn = learn_io.pandas_input_fn(
      x=my_feature_column, y=targets,
      num_epochs=1, shuffle=False)

  # Create a linear regressor object.
  feature_columns = [tf.contrib.layers.real_valued_column(my_feature)]
  linear_regressor = tf.contrib.learn.LinearRegressor(
      feature_columns=feature_columns,
      optimizer=tf.train.GradientDescentOptimizer(learning_rate=learning_rate),
      gradient_clip_norm=5.0
  )

  # Set up to plot the state of our model's line each period.
  plt.figure(figsize=(15, 6))
  plt.subplot(1, 2, 1)
  plt.title("Learned Line by Period")
  plt.ylabel(my_label)
  plt.xlabel(my_feature)
  sample = california_housing_dataframe.sample(n=300)
  plt.scatter(sample[my_feature], sample[my_label])
  colors = [cm.coolwarm(x) for x in np.linspace(-1, 1, periods)]

  # Train the model, but do so inside a loop so that we can periodically
  # assess loss metrics.
  print("Training model...")
  print("RMSE (on training data):")
  root_mean_squared_errors = []
  for period in range(0, periods):
    # Train the model, starting from the prior state.
    linear_regressor.fit(
        input_fn=training_input_fn,
        steps=steps_per_period,
    )
    # Take a break and compute predictions.
    predictions = list(linear_regressor.predict(input_fn=predict_training_input_fn))
    # Compute loss.
    root_mean_squared_error = math.sqrt(
        metrics.mean_squared_error(targets, predictions))
    # Occasionally print the current loss.
    print("  period %02d : %0.2f" % (period, root_mean_squared_error))
    # Add the loss metrics from this period to our list.
    root_mean_squared_errors.append(root_mean_squared_error)
    # Finally, track the weights and biases over time.
    # Apply some math to ensure that the data and line are plotted neatly.
    y_extents = np.array([0, sample[my_label].max()])
    x_extents = (y_extents - linear_regressor.bias_) / linear_regressor.weights_[0]
    x_extents = np.maximum(np.minimum(x_extents,
                                      sample[my_feature].max()),
                           sample[my_feature].min())
    y_extents = linear_regressor.weights_[0] * x_extents + linear_regressor.bias_
    plt.plot(x_extents, y_extents, color=colors[period])
  print("Model training finished.")

  # Output a graph of loss metrics over periods.
  plt.subplot(1, 2, 2)
  plt.ylabel('RMSE')
  plt.xlabel('Periods')
  plt.title("Root Mean Squared Error vs. Periods")
  plt.tight_layout()
  plt.plot(root_mean_squared_errors)

  # Output a table with calibration data.
  calibration_data = pd.DataFrame()
  calibration_data["predictions"] = pd.Series(predictions)
  calibration_data["targets"] = pd.Series(targets)
  display(calibration_data.describe())

  print("Final RMSE (on training data): %0.2f" % root_mean_squared_error)
  return calibration_data
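# A hedged usage sketch; the feature name and hyperparameters are
# illustrative (population is one of the numeric columns in the California
# housing data).
calibration_data = train_model(
    learning_rate=0.00002,  # illustrative value
    steps=500,
    batch_size=5,
    input_feature="population")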