def main(argv=None):  # pylint: disable=unused-argument
    # Build the model selected by --model, then run a single evaluation pass.
    algo = FLAGS.model
    eprint(algo)
    field_sizes = None
    if algo == 'fmuv':
        params = {
            'data_dir': FLAGS.data_dir,
            'num_epochs': FLAGS.num_epochs,
            'batch_size': FLAGS.batch_size,
            'input_dim': FLAGS.input_dim,
            'factor_order': 12,
            'l2_w': 0.001,
        }
        eprint(params)
        model = FMUV(**params)
    elif algo == 'fnn':
        field_sizes = [FLAGS.input_dim] * FLAGS.num_field
        params = {
            'data_dir': FLAGS.data_dir,
            'batch_size': FLAGS.batch_size,
            'num_epochs': FLAGS.num_epochs,
            'input_dim': FLAGS.input_dim,
            'layer_sizes': [field_sizes, 12, 200, 1],
            'layer_acts': ['none', 'tanh', 'none'],
            'layer_l2': [0, 0, 0],
            'l2_w': 0.001,
        }
        eprint(params)
        model = FNN(**params)
    elif algo == 'pnn1':
        field_sizes = [FLAGS.input_dim] * FLAGS.num_field
        params = {
            'data_dir': FLAGS.data_dir,
            'batch_size': FLAGS.batch_size,
            'num_epochs': FLAGS.num_epochs,
            'input_dim': FLAGS.input_dim,
            'layer_sizes': [field_sizes, 12, 1],
            'layer_acts': ['tanh', 'none'],
            'layer_l2': [0, 0],
            'kernel_l2': 0,
            'l2_w': 0.001,
        }
        eprint(params)
        model = PNN1(**params)
    X, y, B = worker_input(field_sizes=field_sizes)
    eval_once(model, X, y, B)
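# Hypothetical wiring sketch for the evaluation entry point above. It assumes
# this script uses the standard TF 1.x app/flags machinery (the FLAGS
# definitions live elsewhere in the file); the guard below is illustrative,
# not part of the original excerpt.
if __name__ == '__main__':
    tf.app.run(main=main)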
    }
    model = CCPM(**ccpm_params)
elif algo == 'pnn1':
    pnn1_params = {
        'layer_sizes': [field_sizes, 10, 1],
        'layer_acts': ['tanh', 'none'],
        'drop_out': [0, 0],
        'opt_algo': 'gd',
        'learning_rate': 0.1,
        'layer_l2': [0, 0],
        'kernel_l2': 0,
        'random_seed': 0,
    }
    model = PNN1(**pnn1_params)
elif algo == 'pnn2':
    pnn2_params = {
        'layer_sizes': [field_sizes, 10, 1],
        'layer_acts': ['tanh', 'none'],
        'drop_out': [0, 0],
        'opt_algo': 'gd',
        'learning_rate': 0.01,
        'layer_l2': [0, 0],
        'kernel_l2': 0,
        'random_seed': 0,
    }
    model = PNN2(**pnn2_params)

if algo in {'fnn', 'ccpm', 'pnn1', 'pnn2'}:
def train():
    dataset = Data("./input/3358/train.txt", "./input/3358/featindex.txt")
    # loader = DataLoader(dataset, 128, True, num_workers=4)
    pnn1 = PNN1(dataset.field_sizes, dataset.feature_sizes)
    pnn1.fit(dataset, save_path="./output/model.pk")
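# Illustrative run guard for the trainer above (an assumption; the original
# excerpt does not show how train() is invoked).
if __name__ == "__main__":
    train()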
def worker_process(cluster, server):
    # Assign ops to the local worker by default; variables go to the ps tasks.
    with tf.device(
            tf.train.replica_device_setter(
                worker_device="/job:worker/task:%d" % FLAGS.task_index,
                cluster=cluster)):
        ps_num = cluster.num_tasks('ps')
        worker_num = cluster.num_tasks('worker')
        algo = FLAGS.model
        eprint(algo)
        field_sizes = None
        if algo == 'fmuv':
            params = {
                'data_dir': FLAGS.data_dir,
                'summary_dir': FLAGS.train_dir,
                'eval_dir': FLAGS.eval_dir,
                'random_seed': FLAGS.task_index,
                'batch_size': FLAGS.batch_size,
                'num_epochs': FLAGS.num_epochs,
                'input_dim': FLAGS.input_dim,
                'learning_rate': FLAGS.learning_rate,
                'opt_algo': FLAGS.optimizer,  # e.g. 'adagrad'
                'sync': FLAGS.sync_replicas,
                'workers': FLAGS.all_workers,
                'factor_order': 12,
                'l2_w': 0.001,
            }
            eprint(params)
            model = FMUV(**params)
        elif algo == 'fnn':
            field_sizes = [FLAGS.input_dim] * FLAGS.num_field
            params = {
                'data_dir': FLAGS.data_dir,
                'summary_dir': FLAGS.train_dir,
                'eval_dir': FLAGS.eval_dir,
                'random_seed': FLAGS.task_index,
                'batch_size': FLAGS.batch_size,
                'num_epochs': FLAGS.num_epochs,
                'input_dim': FLAGS.input_dim,
                'learning_rate': FLAGS.learning_rate,
                'opt_algo': FLAGS.optimizer,  # e.g. 'adagrad'
                'sync': FLAGS.sync_replicas,
                'workers': FLAGS.all_workers,
                'layer_sizes': [field_sizes, 12, 200, 1],
                'layer_acts': ['none', 'tanh', 'none'],
                'drop_out': [0, 0, 0],
                'layer_l2': [0, 0, 0],
                'l2_w': 0.001,
            }
            eprint(params)
            model = FNN(**params)
        elif algo == 'pnn1':
            field_sizes = [FLAGS.input_dim] * FLAGS.num_field
            params = {
                'data_dir': FLAGS.data_dir,
                'summary_dir': FLAGS.train_dir,
                'eval_dir': FLAGS.eval_dir,
                'random_seed': FLAGS.task_index,
                'batch_size': FLAGS.batch_size,
                'num_epochs': FLAGS.num_epochs,
                'input_dim': FLAGS.input_dim,
                'learning_rate': FLAGS.learning_rate,
                'opt_algo': FLAGS.optimizer,  # e.g. 'adagrad'
                'sync': FLAGS.sync_replicas,
                'workers': FLAGS.all_workers,
                'layer_sizes': [field_sizes, 12, 1],
                'layer_acts': ['tanh', 'none'],
                'layer_l2': [0, 0],
                'kernel_l2': 0,
                'l2_w': 0.001,
            }
            eprint(params)
            model = PNN1(**params)

    # Pin the input pipeline to this worker so reads stay local.
    worker_device = "/job:worker/task:%d" % FLAGS.task_index
    with tf.device(worker_device):
        X, y, B = worker_input(field_sizes=field_sizes)

    # The supervisor takes care of session initialization, restoring from
    # a checkpoint, and closing when done or an error occurs.
    # summary_writer = tf.summary.FileWriter(FLAGS.log_dir, model.graph)
    saver = tf.train.Saver(var_list=model.vars,
                           max_to_keep=FLAGS.max_models_to_keep)
    save_interval = 100 if FLAGS.model == "fmuv" else 600

    def load_pretrained_model(sess):
        restore_file = tf.train.latest_checkpoint(FLAGS.resume_dir)
        eprint('restore:', restore_file)
        saver.restore(sess, restore_file)

    load_model_function = load_pretrained_model if FLAGS.resume_dir != '' else None
    is_chief = (FLAGS.task_index == 0)

    # Create a "supervisor", which oversees the training process.
    sv = tf.train.Supervisor(is_chief=is_chief,
                             logdir=FLAGS.train_dir,
                             saver=saver,
                             init_fn=load_model_function,
                             global_step=model.global_step,
                             save_model_secs=save_interval)

    # Retry transient failures; a run longer than 5 minutes counts as a
    # normal finish and ends the retry loop.
    retry_times = 0
    N_failed = 10
    while retry_times < N_failed:
        try:
            eprint('retry_times = %d' % retry_times)
            startt = time.time()
            with sv.managed_session(master=server.target) as sess:
                eprint('------ start ------', datetime.now())
                if is_chief:
                    time.sleep(10)
                run_while_batch(sv, sess, model, X, y, B)
                sv.stop()
                eprint("------ end sv stop:", datetime.now())
            endt = time.time()
            if endt - startt > 300:
                retry_times = N_failed
            else:
                time.sleep(10)
                retry_times += 1
        except Exception:  # a bare except here would also swallow KeyboardInterrupt
            traceback.print_exc()
            retry_times += 1
            time.sleep(10)
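# Sketch of how worker_process is typically driven in a TF 1.x between-graph
# replication setup: build a ClusterSpec from flags, start a tf.train.Server
# for this task, park parameter servers on server.join(), and hand workers to
# worker_process. The flag names ps_hosts, worker_hosts, and job_name are
# assumptions for illustration; they are not shown in this excerpt.
def run_distributed():
    cluster = tf.train.ClusterSpec({
        'ps': FLAGS.ps_hosts.split(','),
        'worker': FLAGS.worker_hosts.split(','),
    })
    server = tf.train.Server(cluster,
                             job_name=FLAGS.job_name,
                             task_index=FLAGS.task_index)
    if FLAGS.job_name == 'ps':
        server.join()  # parameter servers block here and serve variables
    else:
        worker_process(cluster, server)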
if algo == 'pnn1':
    pnn1_params = {
        'field_size': field_sizes,
        'embed_size': 129,
        'layer_sizes': [500, 1],
        'layer_acts': ['relu', None],
        'drop_out': [0, 0],
        'opt_algo': 'gd',
        'learning_rate': 0.1,
        'embed_l2': 0,
        'layer_l2': [0, 0],
        'random_seed': 0,
    }
    print(pnn1_params)
    model = PNN1(**pnn1_params)
elif algo == 'pnn2':
    pnn2_params = {
        'field_size': field_sizes,
        'embed_size': 10,
        'layer_sizes': [500, 1],
        'layer_acts': ['relu', None],
        'drop_out': [0, 0],
        'opt_algo': 'gd',
        'learning_rate': 0.1,
        'embed_l2': 0,
        'layer_l2': [0., 0.],
        'random_seed': 0,
        'layer_norm': True,
    }
    print(pnn2_params)
    model = PNN2(**pnn2_params)
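# Hedged training-loop sketch for the models built above, assuming a
# product-nets-style interface in which model.run(fetches, X, y) executes one
# step and model.optimizer / model.loss are fetchable tensors. train_data and
# utils.slice are stand-ins for dataset helpers defined elsewhere; every name
# in this sketch is an assumption, not part of the original excerpt.
def train_loop(model, train_data, num_round=10, batch_size=256):
    num_examples = len(train_data[1])  # assumes a (X, y) pair layout
    for rnd in range(num_round):
        losses = []
        for j in range(0, num_examples, batch_size):
            X_b, y_b = utils.slice(train_data, j, batch_size)
            _, loss = model.run([model.optimizer, model.loss], X_b, y_b)
            losses.append(loss)
        print('round %d, avg loss %.6f' % (rnd, sum(losses) / len(losses)))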