def get_adanet_model():
    """Build an AdaNet estimator for impression prediction (binary head).

    Relies on module-level globals: RANDOM_SEED, CONFIG, OUTPUT_DIR,
    ADANET_LEARNING_RATE, NUM_EPOCHS, ADANET_ITERATIONS, BATCH_SIZE,
    EVAL_STEPS and the `bidding_data` module.
    """
    # Estimator run configuration: frequent checkpoints/summaries, fixed seed.
    run_config = tf.estimator.RunConfig(
        save_checkpoints_steps=100,
        save_summary_steps=100,
        tf_random_seed=RANDOM_SEED)

    # Candidate subnetworks: simple DNNs with learned mixture weights.
    generator = simple_dnn.Generator(
        learn_mixture_weights=True,
        dropout=CONFIG["DROPOUT"],
        feature_columns=bidding_data.get_feature_columns_for_imp_prediction(),
        optimizer=tf.train.RMSPropOptimizer(learning_rate=ADANET_LEARNING_RATE),
        seed=RANDOM_SEED)

    # Candidate selection is driven by validation-set loss.
    evaluator = adanet.Evaluator(
        input_fn=lambda: bidding_data.validation_input_fn_for_predict_imp(
            batch_size=BATCH_SIZE, num_epochs=NUM_EPOCHS),
        steps=EVAL_STEPS)

    return adanet.Estimator(
        model_dir=OUTPUT_DIR,
        head=tf.contrib.estimator.binary_classification_head(),
        subnetwork_generator=generator,
        max_iteration_steps=NUM_EPOCHS // ADANET_ITERATIONS,
        evaluator=evaluator,
        config=run_config)
def get_adanet_model():
    """Build an AdaNet estimator for win-rate prediction (binary head).

    Returns:
        An `adanet.Estimator` using a simple-DNN subnetwork generator and a
        validation-set evaluator. Relies on module-level globals: NUM_EPOCHS,
        OUTPUT_DIR, BATCH_SIZE, EVAL_STEPS and the `bidding_data` module.
    """
    # Hyperparameters. Local on purpose: they take precedence over any
    # module-level values of the same name for this model only.
    LEARNING_RATE = 0.003
    TRAIN_STEPS = NUM_EPOCHS
    ADANET_ITERATIONS = 8
    RANDOM_SEED = 42

    # Estimator run configuration.
    run_config = tf.estimator.RunConfig(
        save_checkpoints_steps=100,
        save_summary_steps=100,
        tf_random_seed=RANDOM_SEED)

    # Fix: the original bound the estimator twice ("classifier = estimator =
    # ...") and returned only one alias; the redundant name is removed.
    return adanet.Estimator(
        model_dir=OUTPUT_DIR,
        adanet_loss_decay=0.99,
        head=tf.contrib.estimator.binary_classification_head(),
        subnetwork_generator=simple_dnn.Generator(
            learn_mixture_weights=True,
            dropout=0.5,
            feature_columns=bidding_data.get_feature_columns_for_wr_prediction(),
            optimizer=tf.train.RMSPropOptimizer(learning_rate=LEARNING_RATE),
            seed=RANDOM_SEED),
        max_iteration_steps=TRAIN_STEPS // ADANET_ITERATIONS,
        evaluator=adanet.Evaluator(
            input_fn=lambda: bidding_data.validation_input_fn_for_predict_wr(
                batch_size=BATCH_SIZE, num_epochs=TRAIN_STEPS),
            steps=EVAL_STEPS),
        config=run_config)
def test_constructor_errors(self, feature_columns, layer_size=3, initial_num_layers=0):
    """Constructing a Generator with invalid arguments raises ValueError."""
    sgd = tf.train.GradientDescentOptimizer(.1)
    with self.assertRaises(ValueError):
        simple_dnn.Generator(
            feature_columns=feature_columns,
            optimizer=sgd,
            layer_size=layer_size,
            initial_num_layers=initial_num_layers)
def build_subnetwork_generator(self):
    """Return a simple-DNN generator over a single 28x28x1 image column."""
    image_column = tf.feature_column.numeric_column(
        self.FEATURE_KEY, shape=[28, 28, 1])
    return simple_dnn.Generator(
        feature_columns=[image_column],
        optimizer=tf.train.AdamOptimizer(self.learning_rate),
        seed=SEED,
    )
def dnn_ada():
    """Train an AdaNet simple-DNN ensemble on MNIST and print its metrics.

    Relies on module-level globals: LOG_DIR, RANDOM_SEED, head,
    feature_columns, input_fn and time_str.
    """
    print("==============================================")
    start = datetime.datetime.now()
    print("Start Train Adanet with [DNN Model] on Mnist at %s" % time_str(start))
    print("- - - - - - - - - - - - - - - - - - - - - - - -")

    # Hyperparameters for this run.
    learning_rate = 0.003
    train_steps = 5000
    batch_size = 64
    adanet_iterations = 2

    # One model directory per run, stamped with the start time.
    model_dir = os.path.join(LOG_DIR, "dnn_%s" % time_str(start))
    run_config = tf.estimator.RunConfig(
        save_checkpoints_steps=50000,
        save_summary_steps=50000,
        tf_random_seed=RANDOM_SEED,
        model_dir=model_dir)

    ada_estimator = adanet.Estimator(
        head=head,
        subnetwork_generator=simple_dnn.Generator(
            feature_columns=feature_columns,
            optimizer=tf.train.RMSPropOptimizer(learning_rate=learning_rate),
            seed=RANDOM_SEED),
        max_iteration_steps=train_steps // adanet_iterations,
        evaluator=adanet.Evaluator(
            input_fn=input_fn("train", training=False, batch_size=batch_size),
            steps=None),
        config=run_config)

    results, _ = tf.estimator.train_and_evaluate(
        ada_estimator,
        train_spec=tf.estimator.TrainSpec(
            input_fn=input_fn("train", training=True, batch_size=batch_size),
            max_steps=train_steps),
        eval_spec=tf.estimator.EvalSpec(
            input_fn=input_fn("test", training=False, batch_size=batch_size),
            steps=None))

    print("Accuracy:", results["accuracy"])
    print("Loss:", results["average_loss"])
    end = datetime.datetime.now()
    print("Training end at %s" % time_str(end))
    print("Time Spend %s" % str(end - start))
    print("==============================================")
def get_estimator(self):
    """Assemble this object's AdaNet estimator from its stored settings."""
    generator = simple_dnn.Generator(
        feature_columns=self.feature_columns,
        optimizer=tf.train.RMSPropOptimizer(learning_rate=self.LEARNING_RATE),
        seed=self.RANDOM_SEED)
    evaluator = adanet.Evaluator(
        input_fn=self.input_fn(
            "train",
            training=False,
            batch_size=self.BATCH_SIZE,
            RANDOM_SEED=self.RANDOM_SEED),
        steps=None)
    return adanet.Estimator(
        head=self.head,
        subnetwork_generator=generator,
        max_iteration_steps=self.TRAIN_STEPS // self.ADANET_ITERATIONS,
        evaluator=evaluator,
        config=self.config)
def get_adanet_model():
    """Build a multi-label AdaNet estimator with GPU-friendly session options.

    Relies on module-level globals: RANDOM_SEED, CONFIG, OUTPUT_DIR,
    ADANET_LEARNING_RATE, NUM_EPOCHS, ADANET_ITERATIONS, BATCH_SIZE,
    EVAL_STEPS and the `data` module.
    """
    # Session options: log placement, grow GPU memory on demand, cap at 80%.
    session_config = tf.ConfigProto(log_device_placement=True)
    session_config.gpu_options.allow_growth = True
    session_config.gpu_options.per_process_gpu_memory_fraction = 0.8

    run_config = tf.estimator.RunConfig(
        session_config=session_config,
        save_checkpoints_steps=100,
        save_summary_steps=100,
        tf_random_seed=RANDOM_SEED)

    # One sigmoid output per label.
    multi_label_head = tf.contrib.estimator.multi_label_head(
        name="name",
        n_classes=len(CONFIG['LABELS']),
    )

    generator = simple_dnn.Generator(
        learn_mixture_weights=True,
        dropout=CONFIG["DROPOUT"],
        feature_columns=data.get_feature_columns(),
        optimizer=tf.train.AdamOptimizer(learning_rate=ADANET_LEARNING_RATE),
        seed=RANDOM_SEED)

    evaluator = adanet.Evaluator(
        input_fn=lambda: data.validation_input_fn(
            batch_size=BATCH_SIZE, num_epochs=NUM_EPOCHS),
        steps=EVAL_STEPS)

    return adanet.Estimator(
        model_dir=OUTPUT_DIR,
        head=multi_label_head,
        subnetwork_generator=generator,
        max_iteration_steps=NUM_EPOCHS // ADANET_ITERATIONS,
        evaluator=evaluator,
        config=run_config)
def test_generate_candidates(self,
                             want_names,
                             want_subnetwork_losses,
                             want_mixture_weight_losses,
                             want_complexities,
                             learn_mixture_weights=False,
                             initial_num_layers=0,
                             previous_ensemble=None):
    """Builds every candidate subnetwork the Generator proposes, trains each
    for one step, and checks names, losses and complexities against the
    expected values supplied by the parameterized test case."""
    feature_columns = [tf.feature_column.numeric_column("x")]
    generator = simple_dnn.Generator(
        feature_columns=feature_columns,
        optimizer=tf.train.GradientDescentOptimizer(.1),
        layer_size=3,
        initial_num_layers=initial_num_layers,
        learn_mixture_weights=learn_mixture_weights,
        seed=42)
    with tf.Graph().as_default() as g:
        iteration_step = tf.train.create_global_step()
        # Tiny two-example dataset: enough to exercise the graph once.
        features = {"x": [[1.], [2.]]}
        labels = tf.constant([[0.], [1.]])
        names = []
        subnetwork_losses = []
        mixture_weight_losses = []
        complexities = []
        for builder in generator.generate_candidates(
            previous_ensemble,
            # The following arguments are not used by
            # simple_dnn.BuilderGenerator's generate_candidates.
            iteration_number=0,
            previous_ensemble_reports=[],
            all_reports=[]):
            names.append(builder.name)
            # 1. Build subnetwork graph.
            subnetwork = builder.build_subnetwork(
                features,
                logits_dimension=1,
                training=True,
                iteration_step=iteration_step,
                summary=tf.summary,
                previous_ensemble=previous_ensemble)
            # 2. Build subnetwork train ops.
            subnetwork_loss = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(
                    logits=subnetwork.logits, labels=labels))
            subnetwork_train_op = builder.build_subnetwork_train_op(
                subnetwork,
                subnetwork_loss,
                var_list=None,
                labels=labels,
                iteration_step=iteration_step,
                summary=tf.summary,
                previous_ensemble=None)
            # 3. Build mixture weight train ops.
            # Stop gradients since mixture weights should not propagate
            # beyond the top layer.
            subnetwork_logits = tf.stop_gradient(subnetwork.logits)
            # Mixture weight will initialize to a one-valued scalar.
            mixture_weight_logits = tf.layers.dense(
                subnetwork_logits,
                units=1,
                use_bias=False,
                kernel_initializer=tf.ones_initializer())
            mixture_weight_loss = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(
                    logits=mixture_weight_logits, labels=labels))
            mixture_weight_train_op = builder.build_mixture_weights_train_op(
                mixture_weight_loss,
                var_list=None,
                labels=labels,
                logits=mixture_weight_logits,
                iteration_step=iteration_step,
                summary=tf.summary)
            # Run one training step per op, then record the resulting values.
            with self.test_session(graph=g) as sess:
                sess.run(tf.global_variables_initializer())
                sess.run(subnetwork_train_op)
                sess.run(mixture_weight_train_op)
                subnetwork_losses.append(sess.run(subnetwork_loss))
                mixture_weight_losses.append(sess.run(mixture_weight_loss))
                complexities.append(sess.run(subnetwork.complexity))
    self.assertEqual(want_names, names)
    self.assertAllClose(want_subnetwork_losses, subnetwork_losses, atol=1e-3)
    self.assertAllClose(want_mixture_weight_losses, mixture_weight_losses,
                        atol=1e-3)
    self.assertAllClose(want_complexities, complexities, atol=1e-3)
def map_fun(args, ctx):
    """TensorFlowOnSpark worker entry point: trains an AdaNet binary
    classifier on libsvm-formatted text data, dumps train/test predictions,
    and exports a SavedModel (chief only).

    Args:
        args: parsed launcher arguments (input_dim, batch_size, data/test/
            log/export/prediction dirs -- schema defined by the launcher).
        ctx: TensorFlowOnSpark context providing cluster metadata and
            absolute path resolution.
    """
    from datetime import datetime
    import tensorflow as tf
    import os
    import time
    import json
    import adanet
    from adanet.examples import simple_dnn
    worker_num = ctx.worker_num
    job_name = ctx.job_name
    task_index = ctx.task_index
    message = "worker_num: {0}, job_name: {1}, task_index: {2}".format(
        worker_num, job_name, task_index)
    print(message)
    input_dim = int(args.input_dim)
    batch_size = args.batch_size
    # Fix random seed for reproducibility.
    RANDOM_SEED = 42
    FEATURES_KEY = "features"
    NUM_CLASSES = 2
    loss_reduction = tf.losses.Reduction.SUM_OVER_BATCH_SIZE
    # head = tf.contrib.estimator.multi_class_head(NUM_CLASSES, loss_reduction=loss_reduction)
    head = tf.contrib.estimator.binary_classification_head(
        loss_reduction=loss_reduction)
    # numeric_column does not support SparseTensor; inputs are densified in
    # `generator` below.
    feature_columns = [
        tf.feature_column.numeric_column(key=FEATURES_KEY, shape=[input_dim])
    ]
    log_dir = ctx.absolute_path(args.log_dir)
    export_dir = ctx.absolute_path(args.export_dir)
    pred_dir = ctx.absolute_path(args.prediction_dir)
    print("tensorflow log path: {0}".format(log_dir))
    print("tensorflow export path: {0}".format(export_dir))
    print("tensorflow prediction path: {0}".format(pred_dir))

    def generator(ln):
        # Parse one libsvm line: "<label> <idx>:<val> <idx>:<val> ...".
        splits = tf.string_split([ln], delimiter=" ")
        label = splits.values[0]
        label = tf.string_to_number(label, tf.float64)
        # Binarize the label: >= 1.0 becomes 1, everything else 0.
        label = tf.cond(
            label >= 1.0,
            lambda: tf.constant(1, shape=[1], dtype=tf.float32),
            lambda: tf.constant(0, shape=[1], dtype=tf.float32),
        )
        # SparseTensor output; file indices are 1-based, hence the -1.
        col_val = tf.string_split(splits.values[1::], delimiter=":")
        col = tf.string_to_number(col_val.values[0::2], tf.int64) - 1
        vals = col_val.values[1::2]
        vals = tf.string_to_number(vals, tf.float32)
        # Filter out features whose index falls outside [0, input_dim).
        vals = tf.boolean_mask(vals, col < input_dim)
        col = tf.boolean_mask(col, col < input_dim)
        row = tf.cast(tf.fill(tf.shape(col), 0), tf.int64, name="row_cast")
        row_col = tf.transpose(tf.stack([row, col]), name="row_col_transpose")
        sparse = tf.SparseTensor(row_col, vals, (1, input_dim))
        # Convert to dense (191106: required -- numeric_column cannot consume
        # the SparseTensor directly).
        features = {FEATURES_KEY: tf.sparse_tensor_to_dense(sparse)}
        return features, label

    def new_input_fn(partition, training):
        # Returns an input_fn over "part-*" files from the train or test
        # directory; shuffles and repeats only for training.
        # NOTE(review): no .batch() call -- each element is already shaped
        # (1, input_dim) by the SparseTensor above, i.e. batch size 1.
        def _input_fn():
            # path is ok
            parse_fn = generator
            if partition == "train":
                data_dir = ctx.absolute_path(args.data_dir)
                file_pattern = os.path.join(data_dir, "part-*")
                ds = tf.data.Dataset.list_files(file_pattern, shuffle=False)
                ds = ds.apply(
                    tf.contrib.data.parallel_interleave(
                        tf.data.TextLineDataset, cycle_length=10))
                ds = ds.map(parse_fn, num_parallel_calls=5)
                if training:
                    ds = ds.shuffle(batch_size * 5).repeat()
            else:
                data_dir = ctx.absolute_path(args.test_dir)
                file_pattern = os.path.join(data_dir, "part-*")
                ds = tf.data.Dataset.list_files(file_pattern, shuffle=False)
                ds = ds.apply(
                    tf.contrib.data.parallel_interleave(
                        tf.data.TextLineDataset, cycle_length=10))
                ds = ds.map(parse_fn, num_parallel_calls=5)
            iterator = ds.make_one_shot_iterator()
            features, labels = iterator.get_next()
            return features, labels
            # ds = ds.apply(tf.contrib.data.batch_and_drop_remainder(batch_size))
            # return ds.batch(batch_size)

        return _input_fn

    print("========= Start Training")
    LEARNING_RATE = 0.01
    TRAIN_STEPS = 3000
    ADANET_ITERATIONS = 3  # AKA Boosting Iteration
    # Controls model complexity (AdaNet complexity regularization strength).
    ADANET_LAMBDA = 0.1
    LEARN_MIXTURE_WEIGHTS = False
    #strategy = adanet.distributed.RoundRobinStrategy()
    # 191125: TF_CONFIG must be set here for the estimator to see the cluster.
    tfc = json.dumps({
        "cluster": ctx.cluster_spec,
        "task": {
            "type": job_name,
            "index": task_index
        }
    })
    os.environ["TF_CONFIG"] = tfc
    # 191127: trying without device_filter; with a placement strategy it is
    # set to /job:ps automatically, so manual setting is unnecessary.
    config = tf.estimator.RunConfig(
        save_checkpoints_steps=5000,
        tf_random_seed=RANDOM_SEED,
        model_dir=log_dir,
    )
    # config = tf.estimator.RunConfig(
    #     save_checkpoints_steps=5000,
    #     tf_random_seed=RANDOM_SEED,
    #     model_dir=logdir,
    #     session_config=tf.ConfigProto(
    #         log_device_placement=False, device_filters=["/job:ps"]
    #     ),
    # )
    # BaseLine Linear
    # estimator = tf.estimator.LinearClassifier(
    #     feature_columns=feature_columns,
    #     n_classes=NUM_CLASSES,
    #     optimizer=tf.train.RMSPropOptimizer(learning_rate=LEARNING_RATE),
    #     loss_reduction=loss_reduction,
    #     config=config
    # )
    # DNN TEST - ADANET
    estimator = adanet.Estimator(
        head=head,
        force_grow=True,
        subnetwork_generator=simple_dnn.Generator(
            layer_size=128,
            initial_num_layers=2,
            dropout=0.2,
            feature_columns=feature_columns,
            optimizer=tf.train.RMSPropOptimizer(learning_rate=LEARNING_RATE),
            learn_mixture_weights=LEARN_MIXTURE_WEIGHTS,
            seed=RANDOM_SEED,
        ),
        adanet_lambda=ADANET_LAMBDA,
        max_iteration_steps=TRAIN_STEPS // ADANET_ITERATIONS,
        #evaluator=adanet.Evaluator(input_fn=new_input_fn("test", False)),
        evaluator=adanet.Evaluator(input_fn=new_input_fn("test", False),
                                   steps=1000),
        config=config,
        #experimental_placement_strategy=strategy,
        # Record reports (turned out not to be useful in practice):
        # report_materializer=adanet.ReportMaterializer(
        #     input_fn=new_input_fn("train", False),
        # ),
    )
    # Intentionally return nothing here; just run the computation.
    tf.estimator.train_and_evaluate(
        estimator,
        train_spec=tf.estimator.TrainSpec(input_fn=new_input_fn("train", True),
                                          max_steps=TRAIN_STEPS),
        # In the distributed setting this EvalSpec has no real effect.
        eval_spec=tf.estimator.EvalSpec(
            input_fn=new_input_fn("test", False),
            steps=None,
            start_delay_secs=1,
            throttle_secs=30,
        ),
    )

    # The final round only trains; parameters are saved to model.ckpt and no
    # preparation is done for a further round. See
    # https://github.com/tensorflow/adanet/blob/master/adanet/core/estimator_test.py
    # line 2362: test_export_saved_model_always_uses_replication_placement.
    def serving_input_receiver_fn():
        # Serving takes a dense float matrix of shape [None, input_dim].
        serialized_sample = tf.compat.v1.placeholder(dtype=tf.float32,
                                                     shape=[None, input_dim],
                                                     name='features')
        tensor_features = {'features': serialized_sample}
        return tf.estimator.export.ServingInputReceiver(
            features=tensor_features, receiver_tensors=serialized_sample)

    # Cannot run under RoundRobinStrategy.
    if ctx.job_name == "chief":
        # Run prediction for the test and train sets respectively.
        print('export test result')
        predictions = estimator.predict(new_input_fn("test", False))
        print('Writing Predictions to {}'.format(pred_dir))
        tf.gfile.MakeDirs(pred_dir)
        with tf.gfile.GFile("{}/test".format(pred_dir), 'w') as f:
            for pred in predictions:
                f.write(str(pred))
                f.write('\n')
        print('export train result')
        predictions = estimator.predict(new_input_fn("train", False))
        print('Writing Predictions to {}'.format(pred_dir))
        tf.gfile.MakeDirs(pred_dir)
        with tf.gfile.GFile("{}/train".format(pred_dir), 'w') as f:
            for pred in predictions:
                f.write(str(pred))
                f.write('\n')
        # Export the SavedModel.
        estimator.export_saved_model(
            export_dir,
            serving_input_receiver_fn,
            experimental_mode=tf.estimator.ModeKeys.PREDICT)
def map_fun(args, ctx):
    """TensorFlowOnSpark worker entry point: trains an AdaNet binary
    classifier with a positive-class-weighted cross-entropy loss on
    libsvm-formatted text data, then exports a SavedModel (chief only).

    Args:
        args: parsed launcher arguments (input_dim, batch_size, data/test/
            log/export/prediction dirs -- schema defined by the launcher).
        ctx: TensorFlowOnSpark context providing cluster metadata and
            absolute path resolution.
    """
    from datetime import datetime
    import tensorflow as tf
    import os
    import time
    import json
    import adanet
    from adanet.examples import simple_dnn
    worker_num = ctx.worker_num
    job_name = ctx.job_name
    task_index = ctx.task_index
    message = "worker_num: {0}, job_name: {1}, task_index: {2}".format(
        worker_num, job_name, task_index)
    print(message)
    input_dim = int(args.input_dim)
    batch_size = args.batch_size
    # Fix random seed for reproducibility.
    RANDOM_SEED = 42
    FEATURES_KEY = "features"
    loss_reduction = tf.losses.Reduction.SUM_OVER_BATCH_SIZE

    def weighted_cross_entropy_with_logits(labels, logits):
        # Up-weights positive examples by 4x to counter class imbalance.
        return tf.nn.weighted_cross_entropy_with_logits(targets=labels,
                                                        logits=logits,
                                                        pos_weight=4)

    head = tf.contrib.estimator.binary_classification_head(
        loss_reduction=loss_reduction,
        loss_fn=weighted_cross_entropy_with_logits)
    # numeric_column does not support SparseTensor; inputs are densified in
    # `generator` below.
    feature_columns = [
        tf.feature_column.numeric_column(key=FEATURES_KEY, shape=[input_dim])
    ]
    log_dir = ctx.absolute_path(args.log_dir)
    export_dir = ctx.absolute_path(args.export_dir)
    pred_dir = ctx.absolute_path(args.prediction_dir)
    print("tensorflow log path: {0}".format(log_dir))
    print("tensorflow export path: {0}".format(export_dir))
    print("tensorflow prediction path: {0}".format(pred_dir))

    def generator(ln):
        # Parse one libsvm line: "<label> <idx>:<val> <idx>:<val> ...".
        splits = tf.string_split([ln], delimiter=" ")
        label = splits.values[0]
        label = tf.string_to_number(label, tf.float64)
        # Binarize the label: >= 1.0 becomes 1, everything else 0.
        label = tf.cond(
            label >= 1.0,
            lambda: tf.constant(1, shape=[1], dtype=tf.float32),
            lambda: tf.constant(0, shape=[1], dtype=tf.float32),
        )
        # SparseTensor output; file indices are 1-based, hence the -1.
        col_val = tf.string_split(splits.values[1::], delimiter=":")
        col = tf.string_to_number(col_val.values[0::2], tf.int64) - 1
        vals = col_val.values[1::2]
        vals = tf.string_to_number(vals, tf.float32)
        # Filter out features whose index falls outside [0, input_dim).
        vals = tf.boolean_mask(vals, col < input_dim)
        col = tf.boolean_mask(col, col < input_dim)
        row = tf.cast(tf.fill(tf.shape(col), 0), tf.int64, name="row_cast")
        row_col = tf.transpose(tf.stack([row, col]), name="row_col_transpose")
        sparse = tf.SparseTensor(row_col, vals, (1, input_dim))
        # Convert to dense (191106: required -- numeric_column cannot consume
        # the SparseTensor directly).
        features = {FEATURES_KEY: tf.sparse_tensor_to_dense(sparse)}
        return features, label

    def new_input_fn(partition, training):
        # Returns an input_fn over "part-*" files from the train or test
        # directory; shuffles and repeats only for training.
        # NOTE(review): no .batch() call -- each element is already shaped
        # (1, input_dim) by the SparseTensor above, i.e. batch size 1.
        def _input_fn():
            # path is ok
            parse_fn = generator
            if partition == "train":
                data_dir = ctx.absolute_path(args.data_dir)
                file_pattern = os.path.join(data_dir, "part-*")
                ds = tf.data.Dataset.list_files(file_pattern, shuffle=False)
                ds = ds.apply(
                    tf.contrib.data.parallel_interleave(
                        tf.data.TextLineDataset, cycle_length=10))
                ds = ds.map(parse_fn, num_parallel_calls=5)
                if training:
                    ds = ds.shuffle(batch_size * 5).repeat()
            else:
                data_dir = ctx.absolute_path(args.test_dir)
                file_pattern = os.path.join(data_dir, "part-*")
                ds = tf.data.Dataset.list_files(file_pattern, shuffle=False)
                ds = ds.apply(
                    tf.contrib.data.parallel_interleave(
                        tf.data.TextLineDataset, cycle_length=10))
                ds = ds.map(parse_fn, num_parallel_calls=5)
            iterator = ds.make_one_shot_iterator()
            features, labels = iterator.get_next()
            return features, labels
            # ds = ds.apply(tf.contrib.data.batch_and_drop_remainder(batch_size))
            # return ds.batch(batch_size)

        return _input_fn

    print("========= Start Training")
    LEARNING_RATE = 0.01
    TRAIN_STEPS = 1000
    ADANET_ITERATIONS = 4  # AKA Boosting Iteration
    # Controls model complexity (AdaNet complexity regularization strength).
    ADANET_LAMBDA = 0.1
    LEARN_MIXTURE_WEIGHTS = False
    #strategy = adanet.distributed.RoundRobinStrategy()
    # 191125: TF_CONFIG must be set here for the estimator to see the cluster.
    tfc = json.dumps({
        "cluster": ctx.cluster_spec,
        "task": {
            "type": job_name,
            "index": task_index
        }
    })
    os.environ["TF_CONFIG"] = tfc
    # 191127: trying without device_filter; with a placement strategy it is
    # set to /job:ps automatically, so manual setting is unnecessary.
    config = tf.estimator.RunConfig(
        save_checkpoints_steps=5000,
        tf_random_seed=RANDOM_SEED,
        model_dir=log_dir,
    )
    # estimator = tf.estimator.LinearEstimator(
    #     head=head,
    #     feature_columns=feature_columns,
    #     config=config
    #
    # )
    # config = tf.estimator.RunConfig(
    #     save_checkpoints_steps=5000,
    #     tf_random_seed=RANDOM_SEED,
    #     model_dir=logdir,
    #     session_config=tf.ConfigProto(
    #         log_device_placement=False, device_filters=["/job:ps"]
    #     ),
    # )
    # DNN TEST - ADANET
    estimator = adanet.Estimator(
        head=head,
        force_grow=False,
        subnetwork_generator=simple_dnn.Generator(
            layer_size=128,
            initial_num_layers=1,
            dropout=0.2,
            feature_columns=feature_columns,
            optimizer=tf.train.RMSPropOptimizer(learning_rate=LEARNING_RATE),
            learn_mixture_weights=LEARN_MIXTURE_WEIGHTS,
            seed=RANDOM_SEED,
        ),
        adanet_lambda=ADANET_LAMBDA,
        max_iteration_steps=TRAIN_STEPS // ADANET_ITERATIONS,
        evaluator=adanet.Evaluator(input_fn=new_input_fn("test", False),
                                   steps=1000),
        config=config,
    )
    # ensemble_estimator = adanet.AutoEnsembleEstimator(
    #     head=head,
    #     candidate_pool= lambda config: {
    #         "linear1":
    #             tf.estimator.LinearEstimator(
    #                 head=head,
    #                 feature_columns=feature_columns,
    #                 optimizer=tf.train.RMSPropOptimizer(learning_rate=0.1),
    #                 config=config,
    #             ),
    #         "dnn1":
    #             tf.estimator.DNNEstimator(
    #                 head=head,
    #                 feature_columns=feature_columns,
    #                 optimizer=tf.train.RMSPropOptimizer(learning_rate=0.001),
    #                 hidden_units=[512, 256, 128],
    #                 config=config,
    #             ),
    #         "dnn2":
    #             tf.estimator.DNNEstimator(
    #                 head=head,
    #                 feature_columns=feature_columns,
    #                 optimizer=tf.train.RMSPropOptimizer(learning_rate=0.01),
    #                 hidden_units=[256, 128],
    #                 config=config,
    #             ),
    #         "dnn_linear":
    #             tf.estimator.DNNLinearCombinedEstimator(
    #                 head=head,
    #                 dnn_feature_columns=feature_columns,
    #                 linear_feature_columns=feature_columns,
    #                 dnn_hidden_units=[512, 256, 128],
    #                 config=config,
    #             )
    #     },
    #     max_iteration_steps=100,
    # )
    cur_e = estimator
    # Intentionally return nothing here; just run the computation.
    tf.estimator.train_and_evaluate(
        cur_e,
        train_spec=tf.estimator.TrainSpec(input_fn=new_input_fn("train", True),
                                          max_steps=TRAIN_STEPS),
        # In the distributed setting this EvalSpec has no real effect.
        eval_spec=tf.estimator.EvalSpec(
            input_fn=new_input_fn("test", False),
            steps=None,
            start_delay_secs=1,
            throttle_secs=30,
        ),
    )
    # The final round only trains; parameters are saved to model.ckpt and no
    # preparation is done for a further round.
    # That export style requires Example protos as input, which does not suit
    # the DSP's input format:
    # feature_spec = tf.feature_column.make_parse_example_spec(feature_columns)
    # serving_input_receiver_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(feature_spec)

    def serving_input_receiver_fn():
        # Serving receives a sparse encoding: indices, values and dense_shape
        # as three separate placeholders.
        indices = tf.placeholder(dtype=tf.int64,
                                 shape=[None, None],
                                 name='indices')
        values = tf.placeholder(dtype=tf.float32, shape=[None], name='values')
        shape = tf.placeholder(dtype=tf.int64,
                               shape=[None],
                               name='dense_shape')
        receiver_input = {
            'indices': indices,
            'values': values,
            'dense_shape': shape
        }
        # Build a SparseTensor first, then densify it for the model.
        sparse = tf.SparseTensor(indices, values, shape)
        features = {FEATURES_KEY: tf.sparse_tensor_to_dense(sparse)}
        return tf.estimator.export.ServingInputReceiver(
            features, receiver_input)

    # Cannot run under RoundRobinStrategy.
    if ctx.job_name == "chief":
        # Evaluation is slow here; skipped:
        # predictions = cur_e.predict(new_input_fn("test", False))
        # result = cur_e.evaluate(new_input_fn("test", False))
        # with tf.gfile.GFile("{}/evaluate".format(log_dir), 'w') as f:
        #     f.write(str(result))
        #     f.write('\n')
        # Prediction for the test and train sets (disabled):
        # print('export test result')
        # predictions = estimator.predict(new_input_fn("test", False))
        # print('Writing Predictions to {}'.format(pred_dir))
        # tf.gfile.MakeDirs(pred_dir)
        # with tf.gfile.GFile("{}/test".format(pred_dir), 'w') as f:
        #     for pred in predictions:
        #         f.write(str(pred))
        #         f.write('\n')
        # print('export train result')
        # predictions = estimator.predict(new_input_fn("train", False))
        # print('Writing Predictions to {}'.format(pred_dir))
        # tf.gfile.MakeDirs(pred_dir)
        # with tf.gfile.GFile("{}/train".format(pred_dir), 'w') as f:
        #     for pred in predictions:
        #         f.write(pred['classes'][0])
        #         f.write('\n')
        # Export the SavedModel.
        # 191204: this export offers no way to customize the serving outputs.
        cur_e.export_saved_model(export_dir, serving_input_receiver_fn)
def map_fun_v2(args, ctx):
    """TensorFlowOnSpark worker entry point: trains an AdaNet multi-class
    classifier on MNIST using the classic ps/worker replica setup, then
    coordinates shutdown through "done" files.

    Args:
        args: parsed launcher arguments (batch_size, rdma, model, mode --
            schema defined by the launcher).
        ctx: TensorFlowOnSpark context providing cluster metadata and
            absolute path resolution.
    """
    from datetime import datetime
    import tensorflow as tf
    import time
    worker_num = ctx.worker_num
    job_name = ctx.job_name
    task_index = ctx.task_index
    # Parameters
    IMAGE_PIXELS = 28
    hidden_units = 128
    # Fix random seed for reproducibility.
    RANDOM_SEED = 42
    (x_train, y_train), (x_test, y_test) = (tf.keras.datasets.mnist.load_data())
    FEATURES_KEY = "images"
    NUM_CLASSES = 10
    loss_reduction = tf.losses.Reduction.SUM_OVER_BATCH_SIZE
    head = tf.contrib.estimator.multi_class_head(NUM_CLASSES,
                                                 loss_reduction=loss_reduction)
    feature_columns = [
        tf.feature_column.numeric_column(FEATURES_KEY, shape=[28, 28, 1])
    ]
    # Get TF cluster and server instances
    cluster, server = ctx.start_cluster_server(1, args.rdma)

    def generator(images, labels):
        """Returns a generator that returns image-label pairs."""

        def _gen():
            for image, label in zip(images, labels):
                yield image, label

        return _gen

    def preprocess_image(image, label):
        """Preprocesses an image for an `Estimator`."""
        # Scale pixels to [0, 1] and add the trailing channel dimension.
        image = image / 255.
        image = tf.reshape(image, [28, 28, 1])
        features = {FEATURES_KEY: image}
        return features, label

    def input_fn(partition, training):
        """Generate an input_fn for the Estimator."""

        def _input_fn():
            if partition == "train":
                dataset = tf.data.Dataset.from_generator(
                    generator(x_train, y_train), (tf.float32, tf.int32),
                    ((28, 28), ()))
            else:
                dataset = tf.data.Dataset.from_generator(
                    generator(x_test, y_test), (tf.float32, tf.int32),
                    ((28, 28), ()))
            if training:
                dataset = dataset.shuffle(10 * args.batch_size,
                                          seed=RANDOM_SEED).repeat()
            dataset = dataset.map(preprocess_image).batch(args.batch_size)
            iterator = dataset.make_one_shot_iterator()
            features, labels = iterator.get_next()
            return features, labels

        return _input_fn

    if job_name == "ps":
        server.join()
    elif job_name == "worker":
        # Assigns ops to the local worker by default.
        # (Logs inside this section are not visible on the driver.)
        message = ""
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device="/job:worker/task:%d" % task_index,
                    cluster=cluster)):
            print("========= Start Training")
            LEARNING_RATE = 0.003
            TRAIN_STEPS = 5000
            BATCH_SIZE = 64
            ADANET_ITERATIONS = 2
            logdir = ctx.absolute_path(args.model)
            config = tf.estimator.RunConfig(save_checkpoints_steps=50000,
                                            save_summary_steps=50000,
                                            tf_random_seed=RANDOM_SEED,
                                            model_dir=logdir)
            # Linear baseline for comparison (disabled):
            # estimator = tf.estimator.LinearClassifier(
            #     feature_columns=feature_columns,
            #     n_classes=NUM_CLASSES,
            #     optimizer=tf.train.RMSPropOptimizer(learning_rate=LEARNING_RATE),
            #     loss_reduction=loss_reduction,
            #     config=config
            # )
            estimator = adanet.Estimator(
                head=head,
                subnetwork_generator=simple_dnn.Generator(
                    feature_columns=feature_columns,
                    optimizer=tf.train.RMSPropOptimizer(
                        learning_rate=LEARNING_RATE),
                    seed=RANDOM_SEED),
                max_iteration_steps=TRAIN_STEPS // ADANET_ITERATIONS,
                evaluator=adanet.Evaluator(input_fn=input_fn("train",
                                                             training=False),
                                           steps=None),
                config=config)
            results, _ = tf.estimator.train_and_evaluate(
                estimator,
                train_spec=tf.estimator.TrainSpec(input_fn=input_fn(
                    "train", training=True),
                                                  max_steps=TRAIN_STEPS),
                eval_spec=tf.estimator.EvalSpec(input_fn=input_fn(
                    "test", training=False),
                                                steps=None))
            print("Accuracy:", results["accuracy"])
            print("Loss:", results["average_loss"])
            message = "Accuracy: {}; Loss: {}".format(results["accuracy"],
                                                      results["average_loss"])
            print("==============================================")
        print("{} stopping MonitoredTrainingSession".format(
            datetime.now().isoformat()))
        # WORKAROUND FOR https://github.com/tensorflow/tensorflow/issues/21745
        # wait for all other nodes to complete (via done files)
        done_dir = "{}/{}/done".format(ctx.absolute_path(args.model),
                                       args.mode)
        print("Writing done file to: {}".format(done_dir))
        tf.gfile.MakeDirs(done_dir)
        with tf.gfile.GFile("{}/{}".format(done_dir, ctx.task_index),
                            'w') as done_file:
            done_file.write("done")
            done_file.write(message)
        # Poll for up to 60 seconds until every worker has written its file.
        for i in range(60):
            if len(tf.gfile.ListDirectory(done_dir)) < len(
                    ctx.cluster_spec['worker']):
                print("{} Waiting for other nodes {}".format(
                    datetime.now().isoformat(), i))
                time.sleep(1)
            else:
                print("{} All nodes done".format(datetime.now().isoformat()))
                break
def map_fun(args, ctx):
    """TensorFlowOnSpark worker entry point: trains an AdaNet binary CTR
    classifier on sharded libsvm text data using the classic ps/worker
    replica setup, then coordinates shutdown through "done" files.

    Args:
        args: parsed launcher arguments (input_dim, batch_size, epochs,
            format, data/log/export dirs, rdma, mode -- schema defined by
            the launcher).
        ctx: TensorFlowOnSpark context providing cluster metadata and
            absolute path resolution.
    """
    from datetime import datetime
    import tensorflow as tf
    import os
    import time
    worker_num = ctx.worker_num
    job_name = ctx.job_name
    task_index = ctx.task_index
    message = 'worker_num: {0}, job_name: {1}, task_index: {2}'.format(
        worker_num, job_name, task_index)
    input_dim = int(args.input_dim)
    batch_size = args.batch_size
    # Fix random seed for reproducibility.
    RANDOM_SEED = 42
    FEATURES_KEY = "ctr"
    NUM_CLASSES = 2
    loss_reduction = tf.losses.Reduction.SUM_OVER_BATCH_SIZE
    # head = tf.contrib.estimator.multi_class_head(NUM_CLASSES, loss_reduction=loss_reduction)
    head = tf.contrib.estimator.binary_classification_head(
        loss_reduction=loss_reduction)
    # numeric_column does not support SparseTensor; inputs are densified in
    # `generator` below.
    feature_columns = [
        tf.feature_column.numeric_column(FEATURES_KEY, shape=[input_dim])
    ]
    log_dir = ctx.absolute_path(args.log_dir)
    export_dir = ctx.absolute_path(args.export_dir)
    print("tensorflow log path: {0}".format(log_dir))
    print("tensorflow export path: {0}".format(export_dir))
    # Get TF cluster and server instances
    cluster, server = ctx.start_cluster_server(1, args.rdma)

    def generator(ln):
        # Parse one libsvm line: "<label> <idx>:<val> <idx>:<val> ...".
        splits = tf.string_split([ln], delimiter=' ')
        label = splits.values[0]
        label = tf.string_to_number(label, tf.float64)
        # Binarize the label: >= 1.0 becomes 1, everything else 0.
        label = tf.cond(label >= 1.0,
                        lambda: tf.constant(1, shape=[1], dtype=tf.float32),
                        lambda: tf.constant(0, shape=[1], dtype=tf.float32))
        # SparseTensor output; file indices are 1-based, hence the -1.
        col_val = tf.string_split(splits.values[1::], delimiter=':')
        col = tf.string_to_number(col_val.values[0::2], tf.int64) - 1
        vals = col_val.values[1::2]
        vals = tf.string_to_number(vals, tf.float32)
        # Filter out features whose index falls outside [0, input_dim).
        vals = tf.boolean_mask(vals, col < input_dim)
        col = tf.boolean_mask(col, col < input_dim)
        row = tf.cast(tf.fill(tf.shape(col), 0), tf.int64, name='row_cast')
        row_col = tf.transpose(tf.stack([row, col]), name='row_col_transpose')
        sparse = tf.SparseTensor(row_col, vals, (1, input_dim))
        # Convert to dense for numeric_column.
        features = {FEATURES_KEY: tf.sparse_tensor_to_dense(sparse)}
        return features, label

    def input_fn(partition):
        """Generate an input_fn for the Estimator."""

        def _input_fn():
            # Shard the file list across workers so each reads a disjoint
            # subset, then repeat for args.epochs.
            num_workers = len(ctx.cluster_spec['worker'])
            data_dir = ctx.absolute_path(args.data_dir)
            file_pattern = os.path.join(data_dir, 'part-*')
            ds = tf.data.Dataset.list_files(file_pattern)
            ds = ds.shard(num_workers, task_index).repeat(args.epochs)
            if args.format == 'libsvm':
                ds = ds.apply(
                    tf.contrib.data.parallel_interleave(
                        tf.data.TextLineDataset, cycle_length=10))
                parse_fn = generator
            if partition == "train":
                ds = ds.map(parse_fn,
                            num_parallel_calls=5).shuffle(batch_size * 5)
            else:
                ds = ds.map(parse_fn, num_parallel_calls=5)
            ds = ds.apply(
                tf.contrib.data.batch_and_drop_remainder(
                    batch_size)).prefetch(100)
            iterator = ds.make_one_shot_iterator()
            features, labels = iterator.get_next()
            return features, labels

        return _input_fn

    if job_name == "ps":
        server.join()
    elif job_name == "worker":
        # Assigns ops to the local worker by default.
        # (Logs inside this section are not visible on the driver.)
        message = ""
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device="/job:worker/task:%d" % task_index,
                    cluster=cluster)):
            print("========= Start Training")
            LEARNING_RATE = 0.003
            TRAIN_STEPS = 1000
            # So far results are not great -- worse than the linear baseline.
            ADANET_ITERATIONS = 2
            logdir = ctx.absolute_path(args.log_dir)
            config = tf.estimator.RunConfig(save_checkpoints_steps=50000,
                                            save_summary_steps=50000,
                                            tf_random_seed=RANDOM_SEED,
                                            model_dir=logdir)
            # Linear baseline for comparison (disabled):
            # estimator = tf.estimator.LinearClassifier(
            #     feature_columns=feature_columns,
            #     n_classes=NUM_CLASSES,
            #     optimizer=tf.train.RMSPropOptimizer(learning_rate=LEARNING_RATE),
            #     loss_reduction=loss_reduction,
            #     config=config
            # )
            # DNN test - ADANET
            estimator = adanet.Estimator(
                head=head,
                subnetwork_generator=simple_dnn.Generator(
                    layer_size=128,
                    initial_num_layers=3,
                    dropout=0.2,
                    feature_columns=feature_columns,
                    optimizer=tf.train.RMSPropOptimizer(
                        learning_rate=LEARNING_RATE),
                    seed=RANDOM_SEED),
                max_iteration_steps=TRAIN_STEPS // ADANET_ITERATIONS,
                evaluator=adanet.Evaluator(input_fn=input_fn("train"),
                                           steps=None),
                config=config)
            results, _ = tf.estimator.train_and_evaluate(
                estimator,
                train_spec=tf.estimator.TrainSpec(input_fn=input_fn("train"),
                                                  max_steps=TRAIN_STEPS),
                eval_spec=tf.estimator.EvalSpec(input_fn=input_fn("test"),
                                                steps=None))
            print("Accuracy:", results["accuracy"])
            print("Loss:", results["average_loss"])
            message = "Accuracy: {}; Loss: {}".format(results["accuracy"],
                                                      results["average_loss"])
            # Decode the chosen ensemble architecture from the summary proto.
            arch = results["architecture/adanet/ensembles"]
            summary_proto = tf.summary.Summary.FromString(arch)
            arch_result = summary_proto.value[0].tensor.string_val[0]
            print("==============================================")
        print("{} stopping MonitoredTrainingSession".format(
            datetime.now().isoformat()))
        # WORKAROUND for https://github.com/tensorflow/tensorflow/issues/21745
        # wait for all other nodes to complete (via done files)
        done_dir = "{}/{}/done".format(ctx.absolute_path(args.log_dir),
                                       args.mode)
        print("Writing done file to: {}".format(done_dir))
        tf.gfile.MakeDirs(done_dir)
        with tf.gfile.GFile("{}/{}".format(done_dir, ctx.task_index),
                            'w') as done_file:
            done_file.write(message)
            done_file.write(arch_result)
        # Poll for up to 30 seconds until every worker has written its file.
        for i in range(30):
            if len(tf.gfile.ListDirectory(done_dir)) < len(
                    ctx.cluster_spec['worker']):
                print("{} Waiting for other nodes {}".format(
                    datetime.now().isoformat(), i))
                time.sleep(1)
            else:
                print("{} All nodes done".format(datetime.now().isoformat()))
                break
def dnn_ada(): print("==============================================") start = datetime.datetime.now() print("Start Train Adanet with [DNN Model] on Criteo at %s" % time_str(start)) print("- - - - - - - - - - - - - - - - - - - - - - - -") # 根据论文参数调整 LEARNING_RATE = LR model_dir = os.path.join(LOG_DIR, "dnn_%s" % time_str(start)) result_file = os.path.join(RESULT_DIR, "dnn_%s" % time_str(start)) valid_file = os.path.join(RESULT_DIR, "valid_%s" % time_str(start)) test_file = os.path.join(RESULT_DIR, "test_%s" % time_str(start)) tpred_file = os.path.join(RESULT_DIR, "tpred_%s" % time_str(start)) vpred_file = os.path.join(RESULT_DIR, "vpred_%s" % time_str(start)) config = tf.estimator.RunConfig( save_checkpoints_steps=50000, save_summary_steps=50000, tf_random_seed=RANDOM_SEED, model_dir=model_dir ) # layer size 125 256 512 estimator = adanet.Estimator( head=head, subnetwork_generator=simple_dnn.Generator( feature_columns=feature_columns, layer_size=LS, optimizer=tf.train.RMSPropOptimizer(learning_rate=LEARNING_RATE), seed=RANDOM_SEED), max_iteration_steps=TRAIN_STEPS // ADANET_ITERATIONS, evaluator=adanet.Evaluator( input_fn=input_fn("train"), steps=None), config=config ) results, _ = tf.estimator.train_and_evaluate( estimator, train_spec=tf.estimator.TrainSpec( input_fn=input_fn("train"), max_steps=TRAIN_STEPS), eval_spec=tf.estimator.EvalSpec( input_fn=input_fn("test"), steps=None) ) print("Accuracy:", results["accuracy"]) print("AUC", results["auc"]) print("Loss:", results["average_loss"]) # 重新获取评测结果 train_spec = estimator.evaluate(input_fn=input_fn("train")) test_spec = estimator.evaluate(input_fn=input_fn("test")) end = datetime.datetime.now() print("Training end at %s" % time_str(end)) print("Time Spend %s" % str(end - start)) print("==============================================") with open('{}.txt'.format(result_file), 'w') as f: f.write('Train Configs:\n') f.write('[Layer Size] {}\n'.format(LS)) f.write('[Learning Rate] {}\n'.format(LR)) f.write('[BATCH SIZE] 
{}\n'.format(BATCH_SIZE)) f.write('[Train Step] {}\n'.format(TRAIN_STEPS)) f.write('[Adanet Iteration] {}\n'.format(ADANET_ITERATIONS)) f.write('\nResults:\n') f.write('[Accurary] {}\n'.format(results["accuracy"])) f.write('[AUC] {}\n'.format(results["auc"])) f.write('[Loss] {}\n'.format(results["average_loss"])) f.write('[Time Spend] {}\n'.format(str(end - start))) f.write('[Train Spec] {}\n'.format(str(train_spec))) f.write('[Test Spec] {}\n'.format(str(test_spec))) # 写入测试集 print("export test data") test.to_csv('{}.txt'.format(test_file)) print("export train data") train.to_csv('{}.txt'.format(valid_file)) # 进行预测 predictions = estimator.predict(input_fn=input_fn("test")) # 写入预测集 with open('{}.txt'.format(tpred_file), 'w') as f: for pred in predictions: f.write(str(pred)) f.write('\n') # 进行预测并写入预测集 predictions = estimator.predict(input_fn=input_fn("valid")) # 写入预测集 with open('{}.txt'.format(vpred_file), 'w') as f: for pred in predictions: f.write(str(pred)) f.write('\n')