def run(args):
    """Configure logging, build the booster, then train or test with it.

    Args:
        args: parsed options. Reads ``verbosity``, ``role``, ``mode``,
            ``local_addr``, ``peer_addr``, ``application_id``,
            ``use_streaming``, the booster hyper-parameters,
            ``load_model_path`` and ``export_path``.

    Raises:
        AssertionError: if ``role`` or ``mode`` is not an accepted value.
        Exception: anything raised while building or running the booster
            is logged with its traceback and re-raised.
    """
    # 0 -> WARNING, 1 -> INFO, anything else -> DEBUG.
    log_level = {0: logging.WARNING, 1: logging.INFO}.get(
        args.verbosity, logging.DEBUG)
    logging.basicConfig(level=log_level)

    assert args.role in ('leader', 'follower', 'local'), \
        "role must be leader, follower, or local"
    assert args.mode in ('train', 'test', 'eval'), \
        "mode must be train, test, or eval"

    # Only the leader/follower roles get a Bridge to the peer address;
    # the local role runs with bridge=None.
    bridge = None
    if args.role != 'local':
        listen_port = int(args.local_addr.split(':')[1])
        bridge = Bridge(args.role,
                        listen_port,
                        args.peer_addr,
                        args.application_id,
                        0,
                        streaming_mode=args.use_streaming)

    try:
        # Build the boosting ensemble.
        booster_options = dict(
            learning_rate=args.learning_rate,
            max_iters=args.max_iters,
            max_depth=args.max_depth,
            l2_regularization=args.l2_regularization,
            max_bins=args.max_bins,
            num_parallel=args.num_parallel,
            loss_type=args.loss_type,
            send_scores_to_follower=args.send_scores_to_follower,
            send_metrics_to_follower=args.send_metrics_to_follower)
        booster = BoostingTreeEnsamble(bridge, **booster_options)

        # Restore a previously saved model when a path is given.
        if args.load_model_path:
            booster.load_saved_model(args.load_model_path)

        if args.mode == 'train':
            # train() is not handed the bridge explicitly; the booster was
            # already constructed with it above.
            train(args, booster)
        else:
            # 'test' and 'eval' both go through test(), which also takes
            # the bridge.
            test(args, bridge, booster)

        # Persist the model when an export path is given.
        if args.export_path:
            booster.save_model(args.export_path)
    except Exception as e:
        logging.fatal(
            'Exception raised during training: %s',
            traceback.format_exc())
        raise e
    finally:
        # Shut the bridge down on both the success and the failure path.
        if bridge:
            bridge.terminate()
def _run_local(role, args, input_fn, model_fn, serving_input_receiver_fn,
               export_model_hook=None):
    """Run a trainer master and a single worker inside this process.

    The master is started on a daemon thread via ``run_forever``; the
    worker then registers with it through ``LocalTrainerMasterClient``,
    builds a ``Bridge`` and an estimator, and runs the requested mode.

    Args:
        role: compared against ``LEADER`` to pick the master class; also
            passed to the bridge and the estimator.
        args: parsed options. ``local_addr`` and ``peer_addr`` are required.
        input_fn: input function handed to the master and the estimator.
        model_fn: model function handed to the master and the estimator.
        serving_input_receiver_fn: handed to the master.
        export_model_hook: optional hook forwarded to the master.

    Returns:
        None. Returns early, before a bridge is created, when
        ``worker_register()`` reports failure.

    Raises:
        ValueError: if ``args.local_addr`` or ``args.peer_addr`` is empty.
    """
    if not args.local_addr:
        raise ValueError("local-addr is required")
    if not args.peer_addr:
        raise ValueError("peer-addr is required")
    mode = args.mode.lower()

    cluster_spec = _create_cluster_spec(args)
    cluster_server = ClusterServer(cluster_spec, "local")

    # run master
    checkpoint_filename_with_path = _get_checkpoint_filename_with_path(args)
    data_visitor = _create_data_visitor(args)
    master_factory = LeaderTrainerMaster \
        if role == LEADER else FollowerTrainerMaster
    local_master = master_factory(
        cluster_server,
        data_visitor,
        mode,
        model_fn,
        input_fn,
        serving_input_receiver_fn,
        checkpoint_filename_with_path,
        checkpoint_path=args.checkpoint_path,
        save_checkpoint_steps=args.save_checkpoint_steps,
        save_checkpoint_secs=args.save_checkpoint_secs,
        summary_path=args.summary_path,
        summary_save_steps=args.summary_save_steps,
        summary_save_secs=args.summary_save_secs,
        export_path=args.export_path,
        sparse_estimator=args.sparse_estimator,
        export_model_hook=export_model_hook)
    master_thread = threading.Thread(target=local_master.run_forever)
    # Thread.setDaemon() is deprecated (Python 3.10+); the attribute form
    # has the same effect and works on older interpreters as well.
    master_thread.daemon = True
    master_thread.start()

    # run worker
    trainer_master = LocalTrainerMasterClient(local_master, 0)
    if not trainer_master.worker_register():
        return
    bridge = Bridge(role,
                    int(args.local_addr.split(':')[1]),
                    args.peer_addr,
                    args.application_id,
                    0)

    estimator_factory = \
        SparseFLEstimator if args.sparse_estimator else FLEstimator
    estimator = estimator_factory(cluster_server,
                                  trainer_master,
                                  bridge,
                                  role,
                                  model_fn)

    # Any mode other than 'train'/'eval' skips the estimator entirely and
    # falls through to the completion handshake below.
    if mode == 'train':
        estimator.train(input_fn)
    elif mode == 'eval':
        estimator.evaluate(input_fn)

    trainer_master.worker_complete(bridge.terminated_at)
    trainer_master.wait_master_complete()
def train(args):
    """Train, test, or predict with a boosting tree ensemble on CSV data.

    Args:
        args: parsed options. Reads ``verbosity``, ``role``, ``mode``,
            ``data_path``, ``local_addr``, ``peer_addr``,
            ``application_id``, the booster hyper-parameters,
            ``load_model_path``, ``checkpoint_path`` and ``export_path``.

    Raises:
        AssertionError: if ``role`` or ``mode`` is not an accepted value.
        ValueError: if ``data_path`` does not end with ``.csv``.
    """
    if args.verbosity == 0:
        logging.basicConfig(level=logging.WARNING)
    elif args.verbosity == 1:
        logging.basicConfig(level=logging.INFO)
    else:
        logging.basicConfig(level=logging.DEBUG)

    assert args.role in ['leader', 'follower', 'local'], \
        "role must be leader, follower, or local"
    assert args.mode in ['train', 'test', 'eval'], \
        "mode must be train, test, or eval"

    if not args.data_path.endswith('.csv'):
        raise ValueError("Unsupported data type %s" % args.data_path)
    with open(args.data_path, 'rb') as fin:
        data = np.loadtxt(fin, delimiter=',')

    # The last column is split off as the label only for leader/local in
    # train/test mode; every other combination uses all columns as
    # features and has no labels.
    has_labels = (args.mode in ('train', 'test')
                  and args.role in ('leader', 'local'))
    if has_labels:
        X = data[:, :-1]
        y = data[:, -1]
    else:
        X = data
        y = None

    if args.role != 'local':
        bridge = Bridge(args.role,
                        int(args.local_addr.split(':')[1]),
                        args.peer_addr,
                        args.application_id,
                        0)
    else:
        bridge = None

    try:
        booster = BoostingTreeEnsamble(
            bridge,
            learning_rate=args.learning_rate,
            max_iters=args.max_iters,
            max_depth=args.max_depth,
            l2_regularization=args.l2_regularization,
            max_bins=args.max_bins,
            num_parallel=args.num_parallel)

        if args.load_model_path:
            booster.load_saved_model(args.load_model_path)

        if args.mode == 'train':
            booster.fit(X, y, args.checkpoint_path)
        elif args.mode == 'test':
            pred = booster.batch_predict(X)
            # Accuracy needs labels. A follower has y=None, and calling
            # len(None) would raise TypeError, so only score when labels
            # were loaded.
            if y is not None:
                acc = sum((pred > 0.5) == y) / len(y)
                logging.info("Test accuracy: %f", acc)
        else:
            pred = booster.batch_predict(X)
            for i in pred:
                print(i)

        if args.export_path:
            booster.save_model(args.export_path)
    finally:
        # Release the bridge whether or not the work above raised.
        # NOTE(review): assumes Bridge.terminate() is safe to call here --
        # confirm against the Bridge implementation.
        if bridge is not None:
            bridge.terminate()
def train(role, args, input_fn, model_fn, serving_input_receiver_fn):
    """Train an ``FLEstimator`` and optionally export the saved model.

    The trainer master and TensorFlow cluster spec come from the first of
    these that is set: ``args.cluster_spec`` (JSON), ``args.master_addr``,
    ``args.data_path``.

    Args:
        role: passed to the bridge, the trainer master client and the
            estimator.
        args: parsed options.
        input_fn: input function given to ``estimator.train``.
        model_fn: model function given to the estimator.
        serving_input_receiver_fn: used only when exporting.

    Raises:
        AssertionError: if the cluster spec JSON lacks the expected
            structure, or ``tf_addr`` is missing alongside ``master_addr``.
        ValueError: if none of the three sources is set.
    """
    listen_port = int(args.local_addr.split(':')[1])
    bridge = Bridge(role, listen_port, args.peer_addr)

    if args.cluster_spec:
        parsed = json.loads(args.cluster_spec)
        assert 'clusterSpec' in parsed, \
            "cluster_spec do not meet legal format"
        spec = parsed['clusterSpec']
        assert 'Master' in spec, \
            "cluster_spec must include Master"
        assert isinstance(spec['Master'], list), \
            "Master must be list"
        assert 'Worker' in spec, \
            "cluster_spec must include Worker"
        assert isinstance(spec['Worker'], list), \
            "Worker must be list"
        # The first Master entry is the trainer master address.
        trainer_master = TrainerMasterClient(
            spec['Master'][0], role, args.worker_rank)
        cluster_spec = tf.train.ClusterSpec({
            'ps': spec['PS'],
            'worker': {args.worker_rank: args.tf_addr},
        })
    elif args.master_addr:
        assert args.tf_addr is not None, \
            "--tf-addr must be set when master_addr is set."
        trainer_master = TrainerMasterClient(
            args.master_addr, role, args.worker_rank)
        cluster_spec = tf.train.ClusterSpec({
            'ps': args.ps_addrs.split(","),
            'worker': {args.worker_rank: args.tf_addr},
        })
    elif args.data_path:
        trainer_master = LocalTrainerMasterClient(role, args.data_path)
        cluster_spec = None
    else:
        raise ValueError("Either --master-addr or --data-path must be set")

    estimator = FLEstimator(model_fn,
                            bridge,
                            trainer_master,
                            role,
                            worker_rank=args.worker_rank,
                            cluster_spec=cluster_spec)

    # Checkpoint options are forwarded only when a checkpoint path is set.
    train_options = {}
    if args.checkpoint_path:
        train_options = {
            'checkpoint_path': args.checkpoint_path,
            'save_checkpoint_steps': args.save_checkpoint_steps,
        }
    estimator.train(input_fn, **train_options)

    if args.export_path:
        estimator.export_saved_model(args.export_path,
                                     serving_input_receiver_fn,
                                     checkpoint_path=args.checkpoint_path)
def _run_worker(role, args, input_fn, model_fn):
    """Run one worker against a remote trainer master.

    Args:
        role: passed to the bridge and the estimator.
        args: parsed options. ``local_addr``, ``peer_addr`` and
            ``master_addr`` are required.
        input_fn: input function given to the estimator.
        model_fn: model function given to the estimator.

    Returns:
        None. Returns early, before a bridge is created, when
        ``worker_register()`` reports failure.

    Raises:
        ValueError: if a required address is missing.
    """
    if not args.local_addr:
        raise ValueError("local-addr is required")
    if not args.peer_addr:
        raise ValueError("peer-addr is required")
    if not args.master_addr:
        raise ValueError("master-addr is required")

    mode = args.mode.lower()
    rank = args.worker_rank

    cluster_spec = _create_cluster_spec(args, require_ps=True)
    cluster_server = ClusterServer(cluster_spec, "worker", task_index=rank)

    trainer_master = TrainerMasterClient(args.master_addr, rank)
    registered = trainer_master.worker_register(
        cluster_spec.as_cluster_def())
    if not registered:
        return

    listen_port = int(args.local_addr.split(':')[1])
    bridge = Bridge(role, listen_port, args.peer_addr,
                    args.application_id, rank)

    if args.sparse_estimator:
        estimator_factory = SparseFLEstimator
    else:
        estimator_factory = FLEstimator
    # The worker with rank 0 is flagged as chief.
    estimator = estimator_factory(cluster_server,
                                  trainer_master,
                                  bridge,
                                  role,
                                  model_fn,
                                  is_chief=rank == 0)

    # Modes other than 'train'/'eval' run nothing on the estimator.
    action = {'train': estimator.train,
              'eval': estimator.evaluate}.get(mode)
    if action is not None:
        action(input_fn)

    trainer_master.worker_complete(bridge.terminated_at)
    trainer_master.wait_master_complete()
def train(role, args, input_fn, model_fn, serving_input_receiver_fn):
    """Train or evaluate a federated estimator; export after training.

    The trainer master and TensorFlow cluster spec come from the first of
    these that is set: ``args.data_path``, ``args.cluster_spec`` (JSON),
    ``args.master_addr``, ``args.data_source``.

    Args:
        role: passed to the bridge, trainer master client, summary hook and
            estimator.
        args: parsed options.
        input_fn: input function given to the estimator.
        model_fn: model function given to the estimator.
        serving_input_receiver_fn: used only when exporting.

    Raises:
        AssertionError: if the cluster spec JSON lacks the expected
            structure, or ``tf_addr`` is missing alongside ``master_addr``.
        ValueError: if no data/master source is set, if ``data_source`` is
            given without both ``start_time`` and ``end_time``, or if
            ``args.mode`` is neither ``train`` nor ``eval``.
    """
    logging.basicConfig(
        format="%(asctime)-15s [%(filename)s:%(lineno)d] " \
               "%(levelname)s : %(message)s")
    # A negative verbosity matches no branch and leaves the level untouched.
    if args.verbosity == 0:
        logging.getLogger().setLevel(logging.WARNING)
    elif args.verbosity == 1:
        logging.getLogger().setLevel(logging.INFO)
    elif args.verbosity > 1:
        logging.getLogger().setLevel(logging.DEBUG)

    # The application id and worker rank are passed to the bridge only when
    # an application id is set.
    if args.application_id:
        bridge = Bridge(role, int(args.local_addr.split(':')[1]),
                        args.peer_addr, args.application_id,
                        args.worker_rank)
    else:
        bridge = Bridge(role, int(args.local_addr.split(':')[1]),
                        args.peer_addr)

    if args.data_path:
        trainer_master = LocalTrainerMasterClient(role,
                                                  args.data_path,
                                                  epoch_num=args.epoch_num)
        if args.ps_addrs is not None:
            ps_addrs = args.ps_addrs.split(",")
            cluster_spec = tf.train.ClusterSpec({
                'ps': ps_addrs,
                'worker': {
                    args.worker_rank: args.tf_addr
                }
            })
        else:
            cluster_spec = None
    elif args.cluster_spec:
        cluster_spec = json.loads(args.cluster_spec)
        assert 'clusterSpec' in cluster_spec, \
            "cluster_spec do not meet legal format"
        assert 'Master' in cluster_spec['clusterSpec'],\
            "cluster_spec must include Master"
        assert isinstance(cluster_spec['clusterSpec']['Master'], list), \
            "Master must be list"
        assert 'Worker' in cluster_spec['clusterSpec'],\
            "cluster_spec must include Worker"
        assert isinstance(cluster_spec['clusterSpec']['Worker'], list), \
            "Worker must be list"
        trainer_master = TrainerMasterClient(
            cluster_spec['clusterSpec']['Master'][0], role, args.worker_rank)
        cluster_spec = tf.train.ClusterSpec({
            'ps': cluster_spec['clusterSpec']['PS'],
            'worker': {
                args.worker_rank: args.tf_addr
            }
        })
    elif args.master_addr:
        assert args.tf_addr is not None, \
            "--tf-addr must be set when master_addr is set."
        trainer_master = TrainerMasterClient(args.master_addr, role,
                                             args.worker_rank)
        ps_addrs = args.ps_addrs.split(",")
        cluster_spec = tf.train.ClusterSpec({
            'ps': ps_addrs,
            'worker': {
                args.worker_rank: args.tf_addr
            }
        })
    elif args.data_source:
        if args.start_time is None or args.end_time is None:
            raise ValueError(
                "data source must be set with start-date and end-date")
        trainer_master = LocalTrainerMasterClient(role,
                                                  args.data_source,
                                                  start_time=args.start_time,
                                                  end_time=args.end_time,
                                                  epoch_num=args.epoch_num)
        cluster_spec = None
    else:
        raise ValueError("Either --master-addr or --data-path must be set")

    # SummaryHook is configured through class attributes.
    if args.summary_path:
        SummaryHook.summary_path = args.summary_path
        SummaryHook.worker_rank = args.worker_rank
        SummaryHook.role = role
    if args.summary_save_steps:
        SummaryHook.save_steps = args.summary_save_steps

    estimator_cls = SparseFLEstimator if args.sparse_estimator \
        else FLEstimator
    estimator = estimator_cls(model_fn,
                              bridge,
                              trainer_master,
                              role,
                              worker_rank=args.worker_rank,
                              application_id=args.application_id,
                              cluster_spec=cluster_spec)

    run_mode = args.mode.lower()
    if run_mode == 'train':
        estimator.train(input_fn,
                        checkpoint_path=args.checkpoint_path,
                        save_checkpoint_steps=args.save_checkpoint_steps,
                        save_checkpoint_secs=args.save_checkpoint_secs)
        # Only the rank-0 worker exports, into a sub-directory named after
        # bridge.terminated_at.
        if args.export_path and args.worker_rank == 0:
            export_path = '%s/%d' % (args.export_path, bridge.terminated_at)
            estimator.export_saved_model(export_path,
                                         serving_input_receiver_fn,
                                         checkpoint_path=args.checkpoint_path)
            # Written after the export call returns. The context manager
            # closes the handle even if write() raises.
            with tf.io.gfile.GFile('%s/_SUCCESS' % export_path,
                                   'w') as fsuccess:
                fsuccess.write('%d' % bridge.terminated_at)
    elif run_mode == 'eval':
        estimator.evaluate(input_fn, checkpoint_path=args.checkpoint_path)
    else:
        raise ValueError('Allowed values are: --mode=train|eval')
def test_bridge(self):
    """Send two constants from a leader bridge to a follower bridge.

    The leader graph sends x=3.0 and y=2.0; the follower graph receives
    both and computes x - y, which is expected to equal 1.0.
    """
    leader = Bridge('leader', 49951, 'localhost:49952')
    follower = Bridge('follower', 49952, 'localhost:49951')

    # Both bridges live in this process, so one side connects on a helper
    # thread while the other connects on the current thread.
    connector = threading.Thread(target=leader.connect)
    connector.start()
    follower.connect()
    connector.join()

    send_graph = tf.Graph()
    with send_graph.as_default():
        x = tf.constant(3.0, name='x')
        y = tf.constant(2.0, name='y')
        send_ops = [leader.send_op('x', x), leader.send_op('y', y)]

    recv_graph = tf.Graph()
    with recv_graph.as_default():
        recv_x = follower.receive_op('x', dtype=tf.float32)
        recv_y = follower.receive_op('y', dtype=tf.float32)
        difference = recv_x - recv_y

    leader.start()
    follower.start()
    with tf.Session(graph=send_graph) as sess:
        sess.run(send_ops)
    with tf.Session(graph=recv_graph) as sess:
        self.assertEqual(sess.run(difference), 1.0)
    leader.commit()
    follower.commit()

    time.sleep(3)

    # Terminate the same way as connect: one side on a helper thread.
    terminator = threading.Thread(target=leader.terminate)
    terminator.start()
    follower.terminate()
    terminator.join()
def train(role, args, input_fn, model_fn, serving_input_receiver_fn):
    """Train or evaluate a federated estimator, then optionally export it.

    The trainer master and TensorFlow cluster spec come from the first of
    these that is set: ``args.data_path``, ``args.cluster_spec`` (JSON),
    ``args.master_addr``, ``args.data_source``.

    Args:
        role: passed to the bridge, trainer master client, summary hook and
            estimator.
        args: parsed options.
        input_fn: input function given to the estimator.
        model_fn: model function given to the estimator.
        serving_input_receiver_fn: used only when exporting.

    Raises:
        AssertionError: if the cluster spec JSON lacks the expected
            structure, or ``tf_addr`` is missing alongside ``master_addr``.
        ValueError: if no data/master source is set, if ``data_source`` is
            given without both ``start_time`` and ``end_time``, or if
            ``args.mode`` is neither ``train`` nor ``eval``.
    """
    # The application id and worker rank are appended to the bridge
    # arguments only when an application id is set.
    bridge_args = (role, int(args.local_addr.split(':')[1]), args.peer_addr)
    if args.application_id:
        bridge_args += (args.application_id, args.worker_rank)
    bridge = Bridge(*bridge_args)

    if args.data_path:
        trainer_master = LocalTrainerMasterClient(role, args.data_path)
        cluster_spec = None
        if args.ps_addrs is not None:
            cluster_spec = tf.train.ClusterSpec({
                'ps': args.ps_addrs.split(","),
                'worker': {args.worker_rank: args.tf_addr},
            })
    elif args.cluster_spec:
        parsed = json.loads(args.cluster_spec)
        assert 'clusterSpec' in parsed, \
            "cluster_spec do not meet legal format"
        spec = parsed['clusterSpec']
        assert 'Master' in spec, \
            "cluster_spec must include Master"
        assert isinstance(spec['Master'], list), \
            "Master must be list"
        assert 'Worker' in spec, \
            "cluster_spec must include Worker"
        assert isinstance(spec['Worker'], list), \
            "Worker must be list"
        # The first Master entry is the trainer master address.
        trainer_master = TrainerMasterClient(
            spec['Master'][0], role, args.worker_rank)
        cluster_spec = tf.train.ClusterSpec({
            'ps': spec['PS'],
            'worker': {args.worker_rank: args.tf_addr},
        })
    elif args.master_addr:
        assert args.tf_addr is not None, \
            "--tf-addr must be set when master_addr is set."
        trainer_master = TrainerMasterClient(
            args.master_addr, role, args.worker_rank)
        cluster_spec = tf.train.ClusterSpec({
            'ps': args.ps_addrs.split(","),
            'worker': {args.worker_rank: args.tf_addr},
        })
    elif args.data_source:
        if args.start_time is None or args.end_time is None:
            raise ValueError(
                "data source must be set with start-date and end-date")
        trainer_master = LocalTrainerMasterClient(
            role,
            args.data_source,
            start_time=args.start_time,
            end_time=args.end_time)
        cluster_spec = None
    else:
        raise ValueError("Either --master-addr or --data-path must be set")

    # SummaryHook is configured through class attributes.
    if args.summary_path:
        SummaryHook.summary_path = args.summary_path
        SummaryHook.worker_rank = args.worker_rank
        SummaryHook.role = role
    if args.summary_save_steps:
        SummaryHook.save_steps = args.summary_save_steps

    estimator_cls = SparseFLEstimator if args.sparse_estimator \
        else FLEstimator
    estimator = estimator_cls(model_fn,
                              bridge,
                              trainer_master,
                              role,
                              worker_rank=args.worker_rank,
                              cluster_spec=cluster_spec)

    run_mode = args.mode.lower()
    if run_mode == 'train':
        estimator.train(input_fn,
                        checkpoint_path=args.checkpoint_path,
                        save_checkpoint_steps=args.save_checkpoint_steps,
                        save_checkpoint_secs=args.save_checkpoint_secs)
    elif run_mode == 'eval':
        estimator.evaluate(input_fn, checkpoint_path=args.checkpoint_path)
    else:
        raise ValueError('Allowed values are: --mode=train|eval')

    # Export runs after either mode whenever an export path is given.
    if args.export_path:
        estimator.export_saved_model(args.export_path,
                                     serving_input_receiver_fn,
                                     checkpoint_path=args.checkpoint_path)