def main():
    training_data = pd.read_csv('../data/20180105_label.csv',
                                skipinitialspace=True,
                                engine='python',
                                dtype=np.float64,
                                iterator=True)
    test_data = pd.read_csv('../data/20180107_label.csv',
                            skipinitialspace=True,
                            engine='python',
                            dtype=np.float64,
                            iterator=True)
    deep_columns = create_columns(CONTINUOUS_COLUMNS)
    model = DNNClassifier(feature_columns=deep_columns,
                          model_dir='./model',
                          hidden_units=[10, 10],
                          n_classes=2,
                          input_layer_min_slice_size=10000)
    tf.logging.set_verbosity(tf.logging.INFO)

    training_data_chunk = training_data.get_chunk(1000000000)
    model.fit(input_fn=lambda: input_fn(training_data_chunk), steps=100)
    tf.logging.info("end fit model")

    test_data_chunk = test_data.get_chunk(10000)
    accuracy = model.evaluate(input_fn=lambda: input_fn(test_data_chunk),
                              steps=100)['accuracy']
    print(accuracy * 100)
def main(_):
    mnist = input_data.read_data_sets("/tmp/data")
    X_train = mnist.train.images
    X_test = mnist.test.images
    Y_train = mnist.train.labels.astype("int")
    Y_test = mnist.test.labels.astype("int")

    config = RunConfig(tf_random_seed=42, save_checkpoints_secs=10)
    feature_cols = tf.contrib.learn.infer_real_valued_columns_from_input(X_train)
    validation_monitor = monitors.ValidationMonitor(x=X_test, y=Y_test,
                                                    every_n_steps=100)
    dnn_clf = DNNClassifier(
        hidden_units=[300, 100],
        n_classes=10,
        feature_columns=feature_cols,
        config=config,
        model_dir="/home/mtb/Projects/machine_learning/tensorflow/mnist")
    dnn_clf.fit(X_train, Y_train, batch_size=50, steps=4000,
                monitors=[validation_monitor])

    accuracy_score = dnn_clf.evaluate(x=X_test, y=Y_test)["accuracy"]
    print(' accuracy_score: {0} '.format(accuracy_score))
def __init__(self, n_classes, type="w2v", hidden_units=[10, 20, 10],
             num_features=100, context=10, method=1):
    # if type == "w2v":
    #     self.model = w2v_helpers.get_word2vec(num_features, context, method)
    self.type = type
    self.classifier = DNNClassifier(hidden_units=hidden_units,
                                    n_classes=n_classes)
def test_dnn_classifier(self):
    """ test converting DNNClassifier model """
    algorithm_name = "dnn_classifier"
    model_output = os.path.join(self.data.base_path, "{}".format(algorithm_name))
    classifier = DNNClassifier(hidden_units=[4 * 3, 2 * 3],
                               feature_columns=self._iris_dnn_features(),
                               n_classes=3,
                               optimizer=tf.train.AdamOptimizer,
                               config=self.estimator_conf)
    self._generate_tf_model(classifier, model_output)
    self.assertTrue(os.path.exists(model_output))

    pmml_output = os.path.join(self.data.base_path, "{}.pmml".format(algorithm_name))
    ppmml.to_pmml(model_input=model_output,
                  pmml_output=pmml_output,
                  model_type='tensorflow')
    self.assertTrue(os.path.exists(pmml_output))

    # validate pmml file
    data_output = os.path.join(self.data.base_path, "{}.csv".format(algorithm_name))
    ppmml.predict(pmml_output, self.data.test_data_input, data_output)
    self.assertTrue(os.path.exists(data_output))
def main():
    iternumber = 2
    with open('MLPtrained_dead.pickle', 'rb') as f:
        deadcheck = pickle.load(f)

    if os.path.isfile('run_times_{}.npy'.format(iternumber - 1)):
        print('run times exists. loading data')
        run_times = list(np.load('run_times_{}.npy'.format(iternumber - 1)))
    else:
        run_times = []

    feature_columns = [tf.contrib.layers.real_valued_column("x", dimension=9600)]
    estimator = SKCompat(DNNClassifier(feature_columns=feature_columns,
                                       hidden_units=[256, 64],
                                       model_dir='./model/'))
    for i in range(4, 0, -1):
        print(i)
        time.sleep(1)

    # saver = tf.train.import_meta_graph('canabalt nn 200 50.meta')
    while True:
        run_data, run_time = play(estimator, deadcheck)
        run_times.append(run_time)
        np.save('run_times_{}.npy'.format(iternumber - 1), run_times)
def DNNClassifierTrainTask(self, datasource, train_path, test_path, **kwargs):
    steps = kwargs.pop("steps", 2000)
    if datasource == 'system':
        # data from system
        training_set = load_system_dataset(train_path)
        if test_path:
            test_set = load_system_dataset(test_path)
        feature_columns = [real_valued_column("", dimension=4)]
        classifier = DNNClassifier(feature_columns=feature_columns,
                                   **kwargs
                                   # hidden_units=[10, 20, 10],
                                   # n_classes=3
                                   )
        if test_path:
            classifier.fit(x=training_set.data, y=training_set.target, steps=steps)
            accuracy_score = classifier.evaluate(x=test_set.data,
                                                 y=test_set.target)["accuracy"]
            return accuracy_score
class DNN(BaseEstimator, ClassifierMixin):

    def __init__(self, n_classes, type="w2v", hidden_units=[10, 20, 10],
                 num_features=100, context=10, method=1):
        # if type == "w2v":
        #     self.model = w2v_helpers.get_word2vec(num_features, context, method)
        self.type = type
        self.classifier = DNNClassifier(hidden_units=hidden_units,
                                        n_classes=n_classes)

    def pre_transformX(self, df, colnames, df_test=None, n_gram=None):
        data = None
        if self.type == "w2v":
            data = features_helpers.create_sentences(df, colnames)
            data = features_helpers.transform_to_w2v_sentences(data, self.model)
            return data.as_matrix()
        else:
            x_train, x_test = features_helpers.transform_to_bow(df, df_test,
                                                                colnames, n_gram)
            return x_train, x_test

    def pre_transformY(self, df, list_dict):
        y = list(map(lambda w: list_dict.index(w), list(df)))
        return np.array(y)

    def fit(self, X, y=None):
        self.classifier.fit(x=X, y=y, steps=200)

    def predict(self, X, y=None):
        return self.classifier.predict(X)

    def evaluate(self, X, Y):
        return self.classifier.evaluate(x=X, y=Y)["accuracy"]

    def score(self, X, y, sample_weight=None):
        return super(DNN, self).score(X, y, sample_weight)
def get_classifier():
    # (kernel_size * kernel_size, 3)
    feature_columns = [layers.real_valued_column("", dimension=3)]
    return DNNClassifier(
        feature_columns=feature_columns,
        hidden_units=[256, 128],
        n_classes=5,
        model_dir="saved_model",
        # optimizer=AdadeltaOptimizer(learning_rate=0.1)
        # optimizer=AdamOptimizer()
        # dropout=0.5
    )
def main():
    # If the training and test sets aren't stored locally, download them.
    if not os.path.exists(IRIS_TRAINING):
        raw = urlopen(IRIS_TRAINING_URL).read()
        with open(IRIS_TRAINING, "wb") as f:
            f.write(raw)

    if not os.path.exists(IRIS_TEST):
        raw = urlopen(IRIS_TEST_URL).read()
        with open(IRIS_TEST, "wb") as f:
            f.write(raw)

    # Load datasets.
    training_set = load_csv_with_header(filename=IRIS_TRAINING,
                                        target_dtype=np.int,
                                        features_dtype=np.float32)
    test_set = load_csv_with_header(filename=IRIS_TEST,
                                    target_dtype=np.int,
                                    features_dtype=np.float32)

    # Specify that all features have real-value data.
    feature_columns = [real_valued_column("", dimension=4)]

    # Build 3-layer DNN with 10, 20, 10 units respectively.
    classifier = DNNClassifier(feature_columns=feature_columns,
                               hidden_units=[10, 20, 10],
                               n_classes=3,
                               model_dir="/tmp/iris_model")

    # Define the training inputs.
    def get_train_inputs():
        x = tf.constant(training_set.data)
        y = tf.constant(training_set.target)
        return x, y

    # Fit model.
    classifier.fit(input_fn=get_train_inputs, steps=2000)

    # Define the test inputs.
    def get_test_inputs():
        x = tf.constant(test_set.data)
        y = tf.constant(test_set.target)
        return x, y

    # Evaluate accuracy.
    accuracy_score = classifier.evaluate(input_fn=get_test_inputs,
                                         steps=1)["accuracy"]
    print("\nTest Accuracy: {0:f}\n".format(accuracy_score))

    # Classify two new flower samples.
    def new_samples():
        return np.array([[6.4, 3.2, 4.5, 1.5],
                         [5.8, 3.1, 5.0, 1.7]], dtype=np.float32)

    predictions = list(classifier.predict(input_fn=new_samples))
    print("New Samples, Class Predictions: {}\n".format(predictions))
def dnn_main():
    x_train, x_test, y_train, y_test = load_SpamBase(
        "../data/spambase/spambase.data")
    feature_columns = infer_real_valued_columns_from_input(x_train)
    print(feature_columns)

    # hidden_units=[30, 10] means two hidden layers, with 30 and 10 units respectively.
    classifier = DNNClassifier(feature_columns=feature_columns,
                               hidden_units=[30, 10],
                               n_classes=2)

    # steps=500 means training runs for 500 batches; batch_size=10 means each batch
    # holds 10 training examples.
    # One epoch is one pass over the full dataset; within a single epoch the parameters
    # may be updated several times. epoch_num is the chosen number of epochs.
    # One step (iteration) is a single parameter update, performed on batch_size examples.
    # Note: with the same dataset and the same number of epochs, the parameter updates
    # are not necessarily identical; they depend on batch_size.
    # The total number of iterations/steps is (total_examples / batch_size + 1) * epoch_num.
    # Each epoch shuffles the input data and splits it into new batches.
    classifier.fit(x_train, y_train, steps=500, batch_size=10)

    y_predict = list(classifier.predict(x_test, as_iterable=True))
    # y_predict = classifier.predict(x_test)
    # print(y_predict)
    score = metrics.accuracy_score(y_test, y_predict)
    print('Accuracy: {0:f}'.format(score))
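# A minimal sketch of the step/epoch arithmetic described in the comments above.
# The names num_examples, batch_size and epoch_num are illustrative assumptions,
# not taken from the original code (num_examples is only a placeholder value here).
num_examples = 4601
batch_size = 10
epoch_num = 1
steps_per_epoch = num_examples // batch_size + 1
total_steps = steps_per_epoch * epoch_num
print(total_steps)  # roughly the `steps` value needed to cover epoch_num full passes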
def train_model(item_type):
    model_dir = "models/" + item_type.lower().replace(" ", "_")
    if os.path.exists(model_dir):
        return

    print("==> Training model for '%s'" % item_type)
    csv_filename = "data/" + item_type.lower().replace(" ", "_") + ".csv"
    df_all = pd.read_csv(csv_filename, skipinitialspace=True, encoding='utf-8')
    df_all.fillna(0.0, inplace=True)

    # Convert the price to a bucket representing a range
    df_all['price_chaos'] = (df_all['price_chaos'].apply(util.price_bucket)).astype(int)
    # Hash the item type to a number
    df_all['itemType'] = (df_all['itemType'].apply(lambda x: util.type_hash[x])).astype(float)

    LABEL_COLUMN = util.LABEL_COLUMN

    # Split the data 80/20 training/test
    percent_test = 20
    n = (len(df_all) * percent_test) // 100
    df_train = df_all.head(len(df_all) - n)
    df_test = df_all.tail(n)
    train_x = df_train.ix[:, df_train.columns != LABEL_COLUMN].as_matrix().astype(float)
    train_y = df_train.as_matrix([LABEL_COLUMN])
    test_x = df_test.ix[:, df_test.columns != LABEL_COLUMN].as_matrix().astype(float)
    test_y = df_test.as_matrix([LABEL_COLUMN])

    deep_columns = tf.contrib.learn.infer_real_valued_columns_from_input(train_x)
    hidden_units = util.get_hidden_units(len(df_train.columns) - 1)
    model = DNNClassifier(model_dir=model_dir,
                          feature_columns=deep_columns,
                          hidden_units=hidden_units,
                          n_classes=len(util.bins),
                          enable_centered_bias=True)

    steps = len(df_train) // 75
    sessions = (steps // 500) + 2
    for i in range(sessions):
        model.fit(train_x, train_y, steps=500, batch_size=5000)
        results = model.evaluate(test_x, test_y, steps=1, batch_size=df_test.size)

    # Print some predictions from the test data
    predictions = df_test.sample(10)
    v = model.predict_proba(
        predictions.ix[:, df_test.columns != LABEL_COLUMN].as_matrix().astype(float),
        batch_size=10)
    price_map = []
    for i in v:
        # take the top 5 most likely price ranges
        top_largest = i.argsort()[-5:][::-1]
        prices = {}
        for p in top_largest:
            prices[util.get_bin_label(p)] = float(round(100 * i[p], 1))
        price_map.append(prices)
    for r in price_map:
        print(r)
def main():
    training_set = tf.contrib.learn.datasets.base.load_csv_with_header(
        filename='./iris_data/iris_training.csv',
        target_dtype=np.int,
        features_dtype=np.float32)
    test_set = tf.contrib.learn.datasets.base.load_csv_with_header(
        filename='./iris_data/iris_test.csv',
        target_dtype=np.int,
        features_dtype=np.float32)

    feature_columns = [tf.feature_column.numeric_column("x", shape=[4])]
    clf = DNNClassifier(hidden_units=[10, 20, 10],
                        feature_columns=feature_columns,
                        model_dir='./iris_model',
                        n_classes=3)

    train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": np.array(training_set.data)},
        y=np.array(training_set.target),
        num_epochs=None,
        shuffle=True)
    clf.fit(input_fn=train_input_fn, steps=2000)

    test_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": np.array(test_set.data)},
        y=np.array(test_set.target),
        num_epochs=1,
        shuffle=False)
    accuracy_score = clf.evaluate(input_fn=test_input_fn)["accuracy"]
    print("\nTest Accuracy: {0:f}\n".format(accuracy_score))

    new_samples = np.array([[6.4, 3.2, 4.5, 1.5],
                            [5.8, 3.1, 5.0, 1.7]], dtype=np.float32)
    predict_input_fn = tf.estimator.inputs.numpy_input_fn(x={"x": new_samples},
                                                          num_epochs=1,
                                                          shuffle=False)
    predictions = list(clf.predict(input_fn=predict_input_fn))
    print(predictions)
    print("New Samples, Class Predictions: {}\n".format(predictions))
def audit_serving_input_fn():
    return _serving_input_fn(audit_cont_columns, audit_cat_columns)

def build_audit(classifier, max_steps, name, with_proba=True):
    classifier.fit(input_fn=audit_input_fn, max_steps=max_steps)
    adjusted = DataFrame(classifier.predict(input_fn=audit_input_fn, as_iterable=False),
                         columns=["_target"])
    if with_proba:
        adjusted_proba = DataFrame(classifier.predict_proba(input_fn=audit_input_fn, as_iterable=False),
                                   columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name + ".csv")
    store_savedmodel(classifier, audit_serving_input_fn, name)

build_audit(DNNClassifier(hidden_units=[2 * 49],
                          feature_columns=_dnn_feature_columns(audit_feature_columns),
                          optimizer=tf.train.AdamOptimizer(learning_rate=0.00001),
                          config=estimator_conf),
            2000, "DNNClassificationAudit")
build_audit(LinearClassifier(feature_columns=audit_feature_columns,
                             optimizer=tf.train.AdamOptimizer(learning_rate=0.00025),
                             config=estimator_conf),
            5000, "LinearClassificationAudit")

#
# Multi-class classification
#

iris_df = load_csv("Iris.csv")
iris_df["Species"] = iris_df["Species"].replace("setosa", "0").replace("versicolor", "1").replace("virginica", "2").astype(int)

iris_cont_columns = ["Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"]
iris_feature_columns = [real_valued_column(column, dtype=tf.float64) for column in iris_cont_columns]

def iris_input_fn():
    return _input_fn(iris_df, iris_cont_columns, [], "Species")
print('============================================================')
for classifier, acc, cv_acc in results:
    print('Classifier = {}: Accuracy = {} || Mean Cross Val Accuracy scores = {}'
          .format(classifier, acc, cv_acc))

for name, bp in bestparams:
    print('============================================================')
    print('{}-classifier GridSearch Best Params'.format(name))
    print('============================================================')
    display(bp)
    print()
print()

feature_columns = [
    tf.contrib.layers.real_valued_column("", dimension=len(X[0]))
]
dl_clf = DNNClassifier(hidden_units=[10, 20, 10],
                       n_classes=2,
                       feature_columns=feature_columns,
                       model_dir="/tmp/ilpd")
dl_clf.fit(X_train, y_train, steps=4000)
predictions = list(dl_clf.predict(X_test, as_iterable=True))
acc = accuracy_score(y_test, predictions)

print('============================================================')
print('Classifier = {}: Accuracy = {} '.format(DNNClassifier, acc))
print('============================================================')
print('{}-classifier GridSearch Best Params'.format(DNNClassifier))
display(dl_clf.params)
print('============================================================')
# Need to set Kernel to Python 3

import pandas
from tensorflow.contrib.learn import DNNClassifier
from tensorflow.contrib.learn import SKCompat
# from tensorflow.contrib.RunConfig import RunConfig
from tensorflow.contrib.learn import infer_real_valued_columns_from_input

# config = RunConfig(tf_random_seed=42)

# Extracting features from the training data
feature_columns = infer_real_valued_columns_from_input(X_train)

# Create the DNN with two hidden layers (300 neurons and 100 neurons)
dnn_clf = DNNClassifier(hidden_units=[300, 100],
                        n_classes=10,
                        feature_columns=feature_columns)  # config=config

# Wrapper
dnn_clf = SKCompat(dnn_clf)

# Train DNN with mini-batch descent
dnn_clf.fit(X_train, y_train, batch_size=64, steps=5000)

# In[1]:

# VizWiz dataset
import os
import json
from pprint import pprint
mnist.keys()

#%%
X = mnist['data']
y = mnist['target']
y = y.astype(np.int32)
y

#%%
feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input(X)

#%%
feature_columns

#%%
from tensorflow.contrib.learn import DNNClassifier

model = DNNClassifier(hidden_units=[300, 100],
                      feature_columns=feature_columns,
                      n_classes=10)
model.fit(x=X, y=y, batch_size=50, steps=40000)

#%%
# If you run this code on the MNIST dataset (after scaling it, for example with
# sklearn's StandardScaler), you can actually get a model that reaches over 98.1%
# accuracy on the test set -- better than the best model we trained in Chapter 3:
from sklearn.metrics import accuracy_score

y_pred = list(model.predict(X))
print(accuracy_score(y, y_pred))

#%%
# The TF.Learn library also provides some convenient functions for evaluating models
model.evaluate(X, y)

#%%
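#%%
# A minimal sketch of the scaling step mentioned in the comment above, assuming the
# same mnist dict and DNNClassifier setup; the names scaler and X_scaled are
# illustrative and not part of the original code.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X.astype(np.float64))  # standardize pixel features
# model.fit(x=X_scaled, y=y, batch_size=50, steps=40000)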
results, dataframes, best_parameters = parameter_tuning(models, X_train, X_test,
                                                        y_train, y_test)
print()
print('============================================================')
for classifier, acc, cv_acc in results:
    print('{}: Accuracy with Best Parameters = {}% || Mean Cross Validation Accuracy = {}%'
          .format(classifier, round(acc * 100, 4), round(cv_acc * 100, 4)))
print()

for name, bp in best_parameters:
    print('============================================================')
    print('{} classifier GridSearch Best Parameters'.format(name))
    display(bp)
    print()
print()

# Deep Learning using TensorFlow
feature_columns = [tf.contrib.layers.real_valued_column("", dimension=len(X[0]))]
deep_learning = DNNClassifier(hidden_units=[10, 20, 10],
                              feature_columns=feature_columns,
                              model_dir="/tmp/iris")
deep_learning.fit(X_train, y_train, steps=1500)
predictions = list(deep_learning.predict(X_test, as_iterable=True))
acc = accuracy_score(y_test, predictions)

print('============================================================')
print('Deep Learning classifier Accuracy = ', round(acc * 100, 4), '%')
print('------------------------------------------------------------')
print('Deep Learning classifier Best Parameters')
display(deep_learning.params)
print('***************** Execution Completed **********************')
print('------------------------------------------------------------')
def build_estimator(model_dir, model_type):
    """Build an estimator."""
    # Sparse base columns.
    clickTime = tf.contrib.layers.sparse_column_with_integerized_feature(
        "clickTime", bucket_size=24)
    # creativeID = tf.contrib.layers.sparse_column_with_integerized_feature(
    #     "creativeID", bucket_size=7000)
    positionID = tf.contrib.layers.sparse_column_with_integerized_feature(
        "positionID", bucket_size=7646)
    connectionType = tf.contrib.layers.sparse_column_with_integerized_feature(
        "connectionType", bucket_size=5)
    telecomsOperator = tf.contrib.layers.sparse_column_with_integerized_feature(
        "telecomsOperator", bucket_size=4)
    age = tf.contrib.layers.sparse_column_with_integerized_feature(
        "age", bucket_size=81)
    gender = tf.contrib.layers.sparse_column_with_integerized_feature(
        "gender", bucket_size=3)
    education = tf.contrib.layers.sparse_column_with_integerized_feature(
        "education", bucket_size=8)
    marriageStatus = tf.contrib.layers.sparse_column_with_integerized_feature(
        "marriageStatus", bucket_size=4)
    haveBaby = tf.contrib.layers.sparse_column_with_integerized_feature(
        "haveBaby", bucket_size=7)
    hometown = tf.contrib.layers.sparse_column_with_integerized_feature(
        "hometown", bucket_size=365)
    residence = tf.contrib.layers.sparse_column_with_integerized_feature(
        "residence", bucket_size=400)
    adID = tf.contrib.layers.sparse_column_with_integerized_feature(
        "adID", bucket_size=3616)
    camgaignID = tf.contrib.layers.sparse_column_with_integerized_feature(
        "camgaignID", bucket_size=720)
    advertiserID = tf.contrib.layers.sparse_column_with_integerized_feature(
        "advertiserID", bucket_size=91)
    appPlatform = tf.contrib.layers.sparse_column_with_integerized_feature(
        "appPlatform", bucket_size=3)
    appCategory = tf.contrib.layers.sparse_column_with_integerized_feature(
        "appCategory", bucket_size=504)

    wide_columns = [
        clickTime, positionID, connectionType, telecomsOperator, age, gender,
        education, marriageStatus, haveBaby, hometown, residence, adID,
        camgaignID, advertiserID, appPlatform, appCategory,
        # tf.contrib.layers.crossed_column([education, occupation],
        #                                  hash_bucket_size=int(1e4)),
        # tf.contrib.layers.crossed_column(
        #     [age_buckets, education, occupation],
        #     hash_bucket_size=int(1e6)),
        tf.contrib.layers.crossed_column(
            [clickTime, connectionType, telecomsOperator],
            hash_bucket_size=int(1e4))
    ]

    deep_columns = [
        tf.contrib.layers.embedding_column(clickTime, dimension=8),
        tf.contrib.layers.embedding_column(positionID, dimension=8),
        tf.contrib.layers.embedding_column(connectionType, dimension=8),
        tf.contrib.layers.embedding_column(telecomsOperator, dimension=8),
        tf.contrib.layers.embedding_column(age, dimension=8),
        tf.contrib.layers.embedding_column(gender, dimension=8),
        tf.contrib.layers.embedding_column(education, dimension=8),
        tf.contrib.layers.embedding_column(marriageStatus, dimension=8),
        tf.contrib.layers.embedding_column(haveBaby, dimension=8),
        tf.contrib.layers.embedding_column(hometown, dimension=8),
        tf.contrib.layers.embedding_column(residence, dimension=8),
        tf.contrib.layers.embedding_column(adID, dimension=8),
        tf.contrib.layers.embedding_column(camgaignID, dimension=8),
        tf.contrib.layers.embedding_column(advertiserID, dimension=8),
        tf.contrib.layers.embedding_column(appCategory, dimension=8),
        tf.contrib.layers.embedding_column(appPlatform, dimension=8)
    ]

    if model_type == "wide":
        m = LinearClassifier(model_dir=model_dir, feature_columns=wide_columns)
    elif model_type == "deep":
        m = DNNClassifier(model_dir=model_dir,
                          feature_columns=deep_columns,
                          hidden_units=[100, 50])
    else:
        m = DNNLinearCombinedClassifier(model_dir=model_dir,
                                        linear_feature_columns=wide_columns,
                                        dnn_feature_columns=deep_columns,
                                        dnn_hidden_units=[100, 50],
                                        fix_global_step_increment_bug=True)
    return m
adjusted = DataFrame(classifier.predict(input_fn=audit_input_fn, as_iterable=False),
                     columns=["_target"])
if with_proba:
    adjusted_proba = DataFrame(
        classifier.predict_proba(input_fn=audit_input_fn, as_iterable=False),
        columns=["probability(0)", "probability(1)"])
    adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
store_csv(adjusted, name + ".csv")
store_savedmodel(classifier, audit_serving_input_fn, name)

build_audit(
    DNNClassifier(hidden_units=[71, 11],
                  feature_columns=_dnn_feature_columns(audit_feature_columns)),
    "DNNClassificationAudit")
build_audit(LinearClassifier(feature_columns=audit_feature_columns),
            "LinearClassificationAudit")

#
# Multi-class classification
#

iris_df = load_csv("Iris.csv")
iris_df["Species"] = iris_df["Species"].replace("setosa", "0").replace(
    "versicolor", "1").replace("virginica", "2").astype(int)

iris_cont_columns = [
    "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"
]
    l.remove(l[0])
    l = np.array(l)
    labels = l[:, :1]
    data = l[:, 1:]
    return to_int(data), formalize(to_int(labels), 10)

def load_test_data():
    l = []
    with open("test.csv") as f:
        lines = csv.reader(f)
        for line in lines:
            l.append(line)
    l.remove(l[0])
    return to_int(l)

train_images, train_labels = load_train_data()
test_images = load_test_data()
print(train_images[0])

feature_columns = infer_real_valued_columns_from_input(train_images)
clf = DNNClassifier([100], feature_columns, n_classes=10)
print(train_images.shape)
print(train_labels.shape)
clf.fit(train_images, train_labels)
print("done training")

pred = clf.predict(test_images[0])
print(pred)
                    type=str)
parse = parser.parse_args()
TRAIN_DATASET = parse.train
TEST_DATASET = parse.test
OUTPUT_PATH = parse.output

np.random.seed(19260817)

train_set = pandas.read_csv(TRAIN_DATASET)
test_set = pandas.read_csv(TEST_DATASET)

encoder = LabelEncoder().fit(train_set["species"])
train = train_set.drop(["species", "id"], axis=1).values
label = encoder.transform(train_set["species"])
test = test_set.drop(["id"], axis=1).values

scaler = StandardScaler().fit(train)
train = scaler.transform(train)
scaler = StandardScaler().fit(test)
test = scaler.transform(test)

feature_columns = [real_valued_column("", dimension=192)]
classifier = DNNClassifier(feature_columns=feature_columns,
                           n_classes=99,
                           hidden_units=[1024, 512, 256],
                           optimizer=tf.train.AdamOptimizer)
classifier.fit(x=train, y=label, steps=1000)

output = classifier.predict(test)
output_prob = classifier.predict_proba(test)

test_id = test_set.pop("id")
result = pandas.DataFrame(output_prob, index=test_id, columns=encoder.classes_)
result.to_csv(OUTPUT_PATH)