def __init__(self, role, bridge, data_path, ext, worker_rank=0,
             num_workers=1):
    """Set up data-block sourcing and run a leader/follower start barrier.

    NOTE(review): the enclosing class header is outside this view; this is
    an __init__ of a DataBlockLoader-style class.

    Args:
        role: 'leader' or 'follower'. The leader receives block
            announcements from the peer; the follower (or any other role)
            reads blocks through a local trainer master.
        bridge: peer bridge used for the start barrier and, on the leader
            side, for data-block announcements.
        data_path: file or directory holding data blocks; falsy when blocks
            are announced by the peer instead of read locally.
        ext: data-file extension forwarded to LocalTrainerMasterClient.
        worker_rank: this worker's rank within num_workers.
        num_workers: total number of workers sharing the block stream.
    """
    self._role = role
    self._bridge = bridge
    self._num_workers = num_workers
    self._worker_rank = worker_rank
    # Blocks are read on behalf of the *opposite* role's trainer master.
    self._tm_role = 'follower' if role == 'leader' else 'leader'
    if data_path:
        files = None
        if not tf.io.gfile.isdir(data_path):
            # A single file was given: split it into (dirname, [basename]).
            files = [os.path.basename(data_path)]
            data_path = os.path.dirname(data_path)
        self._trainer_master = LocalTrainerMasterClient(
            self._tm_role, data_path, files=files, ext=ext)
    else:
        self._trainer_master = None
    # Monotonic count of blocks handed out; must stay in sync with the peer.
    self._count = 0
    if self._role == 'leader':
        self._block_queue = queue.Queue()
        self._bridge.register_data_block_handler(self._data_block_handler)
        self._bridge.start(self._bridge.new_iter_id())
        # Start barrier: leader sends a one-element marker ...
        self._bridge.send(
            self._bridge.current_iter_id, 'barrier', np.asarray([1]))
        self._bridge.commit()
    elif self._role == 'follower':
        self._bridge.start(self._bridge.new_iter_id())
        # ... which the follower blocks on before proceeding.
        self._bridge.receive(self._bridge.current_iter_id, 'barrier')
        self._bridge.commit()
def _run_local(role, args, input_fn, model_fn, serving_input_receiver_fn,
               export_model_hook=None):
    """Run master and worker in one process for local (single-node) mode.

    Starts a trainer master on a daemon thread, registers this process as
    worker 0 against it, then trains or evaluates through a Bridge to the
    peer. Returns early (without running) if worker registration fails.

    Args:
        role: this party's role (compared against LEADER to pick the
            master implementation).
        args: parsed CLI namespace; local_addr and peer_addr are required.
        input_fn: estimator input function.
        model_fn: estimator model function.
        serving_input_receiver_fn: export signature builder for the master.
        export_model_hook: optional hook forwarded to the master factory.

    Raises:
        ValueError: if local_addr or peer_addr is missing.
    """
    if not args.local_addr:
        raise ValueError("local-addr is required")
    if not args.peer_addr:
        raise ValueError("peer-addr is required")
    mode = args.mode.lower()
    cluster_spec = _create_cluster_spec(args)
    cluster_server = ClusterServer(cluster_spec, "local")

    # Run the trainer master in-process on a background thread.
    checkpoint_filename_with_path = _get_checkpoint_filename_with_path(args)
    data_visitor = _create_data_visitor(args)
    master_factory = LeaderTrainerMaster \
        if role == LEADER else FollowerTrainerMaster
    local_master = master_factory(
        cluster_server,
        data_visitor,
        mode,
        model_fn,
        input_fn,
        serving_input_receiver_fn,
        checkpoint_filename_with_path,
        checkpoint_path=args.checkpoint_path,
        save_checkpoint_steps=args.save_checkpoint_steps,
        save_checkpoint_secs=args.save_checkpoint_secs,
        summary_path=args.summary_path,
        summary_save_steps=args.summary_save_steps,
        summary_save_secs=args.summary_save_secs,
        export_path=args.export_path,
        sparse_estimator=args.sparse_estimator,
        export_model_hook=export_model_hook)
    master_thread = threading.Thread(target=local_master.run_forever)
    # Daemon thread so a worker failure does not hang process exit.
    master_thread.setDaemon(True)
    master_thread.start()

    # Run the worker (rank 0) in this thread against the in-process master.
    trainer_master = LocalTrainerMasterClient(local_master, 0)
    if not trainer_master.worker_register():
        # Master refused registration (e.g. already completed): bail out.
        return
    bridge = Bridge(role, int(args.local_addr.split(':')[1]),
                    args.peer_addr, args.application_id, 0)
    estimator_factory = \
        SparseFLEstimator if args.sparse_estimator else FLEstimator
    estimator = estimator_factory(cluster_server, trainer_master,
                                  bridge, role, model_fn)
    if mode == 'train':
        estimator.train(input_fn)
    elif mode == 'eval':
        estimator.evaluate(input_fn)
    # Report completion, then wait for the master to finish (export etc.).
    trainer_master.worker_complete(bridge.terminated_at)
    trainer_master.wait_master_complete()
def train(role, args, input_fn, model_fn, serving_input_receiver_fn):
    """Train a federated model for *role* using the parsed CLI *args*.

    Picks the trainer-master/cluster configuration from (in priority
    order) --cluster-spec, --master-addr, or --data-path, trains the
    FLEstimator, and optionally exports a saved model.
    """
    local_port = int(args.local_addr.split(':')[1])
    bridge = Bridge(role, local_port, args.peer_addr)

    cluster_spec = None
    if args.cluster_spec:
        parsed = json.loads(args.cluster_spec)
        assert 'clusterSpec' in parsed, \
            "cluster_spec do not meet legal format"
        inner = parsed['clusterSpec']
        assert 'Master' in inner, "cluster_spec must include Master"
        assert isinstance(inner['Master'], list), "Master must be list"
        assert 'Worker' in inner, "cluster_spec must include Worker"
        assert isinstance(inner['Worker'], list), "Worker must be list"
        trainer_master = TrainerMasterClient(
            inner['Master'][0], role, args.worker_rank)
        cluster_spec = tf.train.ClusterSpec({
            'ps': inner['PS'],
            'worker': {args.worker_rank: args.tf_addr}})
    elif args.master_addr:
        assert args.tf_addr is not None, \
            "--tf-addr must be set when master_addr is set."
        trainer_master = TrainerMasterClient(
            args.master_addr, role, args.worker_rank)
        cluster_spec = tf.train.ClusterSpec({
            'ps': args.ps_addrs.split(","),
            'worker': {args.worker_rank: args.tf_addr}})
    elif args.data_path:
        # Standalone mode: blocks are read locally, no TF cluster.
        trainer_master = LocalTrainerMasterClient(role, args.data_path)
    else:
        raise ValueError("Either --master-addr or --data-path must be set")

    estimator = FLEstimator(
        model_fn, bridge, trainer_master, role,
        worker_rank=args.worker_rank,
        cluster_spec=cluster_spec)

    # Checkpointing kwargs only apply when a checkpoint path was given.
    if args.checkpoint_path:
        estimator.train(input_fn,
                        checkpoint_path=args.checkpoint_path,
                        save_checkpoint_steps=args.save_checkpoint_steps)
    else:
        estimator.train(input_fn)

    if args.export_path:
        estimator.export_saved_model(args.export_path,
                                     serving_input_receiver_fn,
                                     checkpoint_path=args.checkpoint_path)
class DataBlockLoader(object):
    """Hands out data blocks to a worker, keeping leader/follower in sync.

    The leader receives block announcements from the peer over the bridge
    and queues them; the follower (or 'local' role) pulls blocks from a
    local trainer master, skipping blocks whose output already exists under
    ``output_path`` (resume support).
    """

    def __init__(self, role, bridge, data_path, ext, worker_rank=0,
                 num_workers=1, output_path=None):
        """Set up block sourcing and run the leader/follower start barrier.

        Args:
            role: 'leader', 'follower', or 'local'.
            bridge: peer bridge for the barrier and block announcements.
            data_path: file or directory of blocks; falsy when blocks are
                announced by the peer instead of read locally.
            ext: data-file extension for LocalTrainerMasterClient.
            worker_rank: this worker's rank within num_workers.
            num_workers: total number of workers sharing the block stream.
            output_path: if set, blocks with an existing '<id>.output' file
                there are skipped (already processed).
        """
        self._role = role
        self._bridge = bridge
        self._num_workers = num_workers
        self._worker_rank = worker_rank
        self._output_path = output_path
        # Blocks are read on behalf of the *opposite* role's master.
        self._tm_role = 'follower' if role == 'leader' else 'leader'
        if data_path:
            files = None
            if not tf.io.gfile.isdir(data_path):
                # Single file given: split into (dirname, [basename]).
                files = [os.path.basename(data_path)]
                data_path = os.path.dirname(data_path)
            self._trainer_master = LocalTrainerMasterClient(
                self._tm_role, data_path, files=files, ext=ext)
        else:
            self._trainer_master = None
        # Monotonic block count; must match the peer's announcements.
        self._count = 0
        if self._role == 'leader':
            self._block_queue = queue.Queue()
            self._bridge.register_data_block_handler(
                self._data_block_handler)
            self._bridge.start(self._bridge.new_iter_id())
            # Start barrier: leader sends a marker the follower waits on.
            self._bridge.send(self._bridge.current_iter_id, 'barrier',
                              np.asarray([1]))
            self._bridge.commit()
        elif self._role == 'follower':
            self._bridge.start(self._bridge.new_iter_id())
            self._bridge.receive(self._bridge.current_iter_id, 'barrier')
            self._bridge.commit()

    def _data_block_handler(self, msg):
        """Bridge callback for a peer block announcement.

        Returns True when the block was accepted and queued, False when the
        announced block cannot be resolved locally.
        """
        logging.debug('DataBlock: recv "%s" at %d', msg.block_id, msg.count)
        assert self._count == msg.count
        if not msg.block_id:
            # Empty id marks end-of-stream.
            block = None
        elif self._trainer_master is not None:
            block = self._trainer_master.request_data_block(msg.block_id)
            # BUGFIX: the failure return must be conditional on the lookup
            # result; otherwise known blocks are never queued either.
            if block is None:
                return False
        else:
            block = DataBlockInfo(msg.block_id, None)
        self._count += 1
        self._block_queue.put(block)
        return True

    def _request_data_block(self):
        """Take this worker's slot in a round-robin of num_workers requests,
        retrying past blocks whose output file already exists."""
        while True:
            # Burn the slots belonging to lower-ranked workers ...
            for _ in range(self._worker_rank):
                self._trainer_master.request_data_block()
            # ... take ours ...
            block = self._trainer_master.request_data_block()
            # ... and burn the remaining slots so all workers stay aligned.
            for _ in range(self._num_workers - self._worker_rank - 1):
                self._trainer_master.request_data_block()
            if block is None or self._output_path is None or \
                not tf.io.gfile.exists(os.path.join(
                    self._output_path, block.block_id) + '.output'):
                break
        return block

    def get_next_block(self):
        """Return the next DataBlockInfo, or None at end of stream."""
        if self._role == 'local':
            return self._request_data_block()
        if self._tm_role == 'leader':
            while True:
                block = self._request_data_block()
                if block is not None:
                    # Peer rejected the block: skip it and try the next.
                    if not self._bridge.load_data_block(
                            self._count, block.block_id):
                        continue
                else:
                    # Propagate end-of-stream with an empty block id.
                    self._bridge.load_data_block(self._count, '')
                break
            self._count += 1
        else:
            block = self._block_queue.get()
        return block
def train(role, args, input_fn, model_fn, serving_input_receiver_fn):
    """Train or evaluate a federated model for *role* from CLI *args*.

    Configures logging verbosity, builds the peer Bridge, selects the
    trainer master and TF cluster spec from (in order) --data-path,
    --cluster-spec, --master-addr, or --data-source, then runs the chosen
    estimator. In train mode, worker 0 optionally exports a saved model
    and writes a _SUCCESS marker.

    Raises:
        ValueError: when no data/master source is configured, when
            --data-source lacks start/end time, or on an unknown --mode.
    """
    logging.basicConfig(
        format="%(asctime)-15s [%(filename)s:%(lineno)d] " \
               "%(levelname)s : %(message)s")
    if args.verbosity == 0:
        logging.getLogger().setLevel(logging.WARNING)
    elif args.verbosity == 1:
        logging.getLogger().setLevel(logging.INFO)
    elif args.verbosity > 1:
        logging.getLogger().setLevel(logging.DEBUG)

    if args.application_id:
        bridge = Bridge(role, int(args.local_addr.split(':')[1]),
                        args.peer_addr, args.application_id,
                        args.worker_rank)
    else:
        bridge = Bridge(role, int(args.local_addr.split(':')[1]),
                        args.peer_addr)

    if args.data_path:
        trainer_master = LocalTrainerMasterClient(
            role, args.data_path, epoch_num=args.epoch_num)
        if args.ps_addrs is not None:
            ps_addrs = args.ps_addrs.split(",")
            cluster_spec = tf.train.ClusterSpec({
                'ps': ps_addrs,
                'worker': {args.worker_rank: args.tf_addr}})
        else:
            cluster_spec = None
    elif args.cluster_spec:
        cluster_spec = json.loads(args.cluster_spec)
        assert 'clusterSpec' in cluster_spec, \
            "cluster_spec do not meet legal format"
        assert 'Master' in cluster_spec['clusterSpec'],\
            "cluster_spec must include Master"
        assert isinstance(cluster_spec['clusterSpec']['Master'], list), \
            "Master must be list"
        assert 'Worker' in cluster_spec['clusterSpec'],\
            "cluster_spec must include Worker"
        assert isinstance(cluster_spec['clusterSpec']['Worker'], list), \
            "Worker must be list"
        trainer_master = TrainerMasterClient(
            cluster_spec['clusterSpec']['Master'][0], role, args.worker_rank)
        cluster_spec = tf.train.ClusterSpec({
            'ps': cluster_spec['clusterSpec']['PS'],
            'worker': {args.worker_rank: args.tf_addr}})
    elif args.master_addr:
        assert args.tf_addr is not None, \
            "--tf-addr must be set when master_addr is set."
        trainer_master = TrainerMasterClient(
            args.master_addr, role, args.worker_rank)
        ps_addrs = args.ps_addrs.split(",")
        cluster_spec = tf.train.ClusterSpec({
            'ps': ps_addrs,
            'worker': {args.worker_rank: args.tf_addr}})
    elif args.data_source:
        if args.start_time is None or args.end_time is None:
            raise ValueError(
                "data source must be set with start-date and end-date")
        trainer_master = LocalTrainerMasterClient(
            role, args.data_source,
            start_time=args.start_time,
            end_time=args.end_time,
            epoch_num=args.epoch_num)
        cluster_spec = None
    else:
        raise ValueError("Either --master-addr or --data-path must be set")

    # SummaryHook is configured through class attributes (project style).
    if args.summary_path:
        SummaryHook.summary_path = args.summary_path
        SummaryHook.worker_rank = args.worker_rank
        SummaryHook.role = role
    if args.summary_save_steps:
        SummaryHook.save_steps = args.summary_save_steps

    if args.sparse_estimator:
        estimator = SparseFLEstimator(model_fn, bridge, trainer_master, role,
                                      worker_rank=args.worker_rank,
                                      application_id=args.application_id,
                                      cluster_spec=cluster_spec)
    else:
        estimator = FLEstimator(model_fn, bridge, trainer_master, role,
                                worker_rank=args.worker_rank,
                                application_id=args.application_id,
                                cluster_spec=cluster_spec)

    run_mode = args.mode.lower()
    if run_mode == 'train':
        estimator.train(input_fn,
                        checkpoint_path=args.checkpoint_path,
                        save_checkpoint_steps=args.save_checkpoint_steps,
                        save_checkpoint_secs=args.save_checkpoint_secs)
        # Only worker 0 exports, under a timestamped subdirectory.
        if args.export_path and args.worker_rank == 0:
            export_path = '%s/%d' % (args.export_path, bridge.terminated_at)
            estimator.export_saved_model(export_path,
                                         serving_input_receiver_fn,
                                         checkpoint_path=args.checkpoint_path)
            # BUGFIX: use a context manager so the _SUCCESS marker file is
            # closed even if the write raises (was open/write/close).
            with tf.io.gfile.GFile('%s/_SUCCESS' % export_path,
                                   'w') as fsuccess:
                fsuccess.write('%d' % bridge.terminated_at)
    elif run_mode == 'eval':
        estimator.evaluate(input_fn, checkpoint_path=args.checkpoint_path)
    else:
        raise ValueError('Allowed values are: --mode=train|eval')
def train(role, args, input_fn, model_fn, serving_input_receiver_fn):
    """Train or evaluate a federated model for *role* from CLI *args*.

    Builds the peer Bridge, selects the trainer master and TF cluster spec
    from (in order) --data-path, --cluster-spec, --master-addr, or
    --data-source, runs the chosen estimator, and optionally exports a
    saved model afterwards.

    Raises:
        ValueError: when no data/master source is configured, when
            --data-source lacks start/end time, or on an unknown --mode.
    """
    if args.application_id:
        bridge = Bridge(role, int(args.local_addr.split(':')[1]),
                        args.peer_addr, args.application_id,
                        args.worker_rank)
    else:
        bridge = Bridge(role, int(args.local_addr.split(':')[1]),
                        args.peer_addr)
    if args.data_path:
        # Local data: blocks are read directly from the filesystem.
        trainer_master = LocalTrainerMasterClient(role, args.data_path)
        if args.ps_addrs is not None:
            ps_addrs = args.ps_addrs.split(",")
            cluster_spec = tf.train.ClusterSpec({
                'ps': ps_addrs,
                'worker': {
                    args.worker_rank: args.tf_addr
                }
            })
        else:
            cluster_spec = None
    elif args.cluster_spec:
        # JSON cluster spec: validate shape, then contact the remote master.
        cluster_spec = json.loads(args.cluster_spec)
        assert 'clusterSpec' in cluster_spec, \
            "cluster_spec do not meet legal format"
        assert 'Master' in cluster_spec['clusterSpec'],\
            "cluster_spec must include Master"
        assert isinstance(cluster_spec['clusterSpec']['Master'], list), \
            "Master must be list"
        assert 'Worker' in cluster_spec['clusterSpec'],\
            "cluster_spec must include Worker"
        assert isinstance(cluster_spec['clusterSpec']['Worker'], list), \
            "Worker must be list"
        trainer_master = TrainerMasterClient(
            cluster_spec['clusterSpec']['Master'][0], role, args.worker_rank)
        # Rebind cluster_spec from the parsed JSON to a TF ClusterSpec.
        cluster_spec = tf.train.ClusterSpec({
            'ps': cluster_spec['clusterSpec']['PS'],
            'worker': {
                args.worker_rank: args.tf_addr
            }
        })
    elif args.master_addr:
        assert args.tf_addr is not None, \
            "--tf-addr must be set when master_addr is set."
        trainer_master = TrainerMasterClient(args.master_addr,
                                             role, args.worker_rank)
        ps_addrs = args.ps_addrs.split(",")
        cluster_spec = tf.train.ClusterSpec({
            'ps': ps_addrs,
            'worker': {
                args.worker_rank: args.tf_addr
            }
        })
    elif args.data_source:
        if args.start_time is None or args.end_time is None:
            raise ValueError(
                "data source must be set with start-date and end-date")
        trainer_master = LocalTrainerMasterClient(role, args.data_source,
                                                  start_time=args.start_time,
                                                  end_time=args.end_time)
        cluster_spec = None
    else:
        raise ValueError("Either --master-addr or --data-path must be set")
    # SummaryHook is configured through class attributes (project style).
    if args.summary_path:
        SummaryHook.summary_path = args.summary_path
        SummaryHook.worker_rank = args.worker_rank
        SummaryHook.role = role
    if args.summary_save_steps:
        SummaryHook.save_steps = args.summary_save_steps
    if args.sparse_estimator:
        estimator = SparseFLEstimator(model_fn, bridge, trainer_master, role,
                                      worker_rank=args.worker_rank,
                                      cluster_spec=cluster_spec)
    else:
        estimator = FLEstimator(model_fn, bridge, trainer_master, role,
                                worker_rank=args.worker_rank,
                                cluster_spec=cluster_spec)
    run_mode = args.mode.lower()
    if run_mode == 'train':
        estimator.train(input_fn,
                        checkpoint_path=args.checkpoint_path,
                        save_checkpoint_steps=args.save_checkpoint_steps,
                        save_checkpoint_secs=args.save_checkpoint_secs)
    elif run_mode == 'eval':
        estimator.evaluate(input_fn, checkpoint_path=args.checkpoint_path)
    else:
        raise ValueError('Allowed values are: --mode=train|eval')
    # NOTE(review): unlike sibling variants, export is not gated on
    # worker_rank == 0 here and also runs after eval mode — confirm intended.
    if args.export_path:
        estimator.export_saved_model(args.export_path,
                                     serving_input_receiver_fn,
                                     checkpoint_path=args.checkpoint_path)
class DataBlockLoader(object):
    """Hands out data blocks to a worker, keeping leader/follower in sync.

    The leader receives block announcements from the peer over the bridge
    and queues them; the follower (or 'local' role) pulls blocks from a
    local trainer master in worker-rank order.
    """

    def __init__(self, role, bridge, data_path, ext,
                 worker_rank=0, num_workers=1):
        """Set up block sourcing and run the leader/follower start barrier.

        Args:
            role: 'leader', 'follower', or 'local'.
            bridge: peer bridge for the barrier and block announcements.
            data_path: file or directory of blocks; falsy when blocks are
                announced by the peer instead of read locally.
            ext: data-file extension for LocalTrainerMasterClient.
            worker_rank: this worker's rank within num_workers.
            num_workers: total number of workers sharing the block stream.
        """
        self._role = role
        self._bridge = bridge
        self._num_workers = num_workers
        self._worker_rank = worker_rank
        # Blocks are read on behalf of the *opposite* role's master.
        self._tm_role = 'follower' if role == 'leader' else 'leader'
        if data_path:
            files = None
            if not tf.io.gfile.isdir(data_path):
                # Single file given: split into (dirname, [basename]).
                files = [os.path.basename(data_path)]
                data_path = os.path.dirname(data_path)
            self._trainer_master = LocalTrainerMasterClient(
                self._tm_role, data_path, files=files, ext=ext)
        else:
            self._trainer_master = None
        # Monotonic block count; must match the peer's announcements.
        self._count = 0
        if self._role == 'leader':
            self._block_queue = queue.Queue()
            self._bridge.register_data_block_handler(self._data_block_handler)
            self._bridge.start(self._bridge.new_iter_id())
            # Start barrier: leader sends a marker the follower waits on.
            self._bridge.send(
                self._bridge.current_iter_id, 'barrier', np.asarray([1]))
            self._bridge.commit()
        elif self._role == 'follower':
            self._bridge.start(self._bridge.new_iter_id())
            self._bridge.receive(self._bridge.current_iter_id, 'barrier')
            self._bridge.commit()

    def _data_block_handler(self, msg):
        """Bridge callback: queue a peer-announced block.

        Raises:
            ValueError: if the announced block id is unknown locally.
        """
        logging.debug('DataBlock: recv "%s" at %d', msg.block_id, msg.count)
        assert self._count == msg.count
        if not msg.block_id:
            # Empty id marks end-of-stream.
            block = None
        elif self._trainer_master is not None:
            block = self._trainer_master.request_data_block(msg.block_id)
            if block is None:
                raise ValueError("Block %s not found" % msg.block_id)
        else:
            block = DataBlockInfo(msg.block_id, None)
        self._count += 1
        self._block_queue.put(block)

    def _request_data_block(self):
        """Take this worker's slot in a round-robin of num_workers requests."""
        # Burn the slots belonging to lower-ranked workers ...
        for _ in range(self._worker_rank):
            self._trainer_master.request_data_block()
        # ... take ours ...
        block = self._trainer_master.request_data_block()
        # ... and burn the remaining slots so all workers stay aligned.
        for _ in range(self._num_workers - self._worker_rank - 1):
            self._trainer_master.request_data_block()
        return block

    def get_next_block(self):
        """Return the next DataBlockInfo, or None at end of stream."""
        if self._role == 'local':
            return self._request_data_block()
        if self._tm_role == 'leader':
            while True:
                block = self._request_data_block()
                if block is not None:
                    try:
                        self._bridge.load_data_block(self._count,
                                                     block.block_id)
                    # Best-effort: a failed load skips to the next block.
                    except Exception as e:  # pylint: disable=broad-except
                        logging.error('load data block %s with error: %s',
                                      block.block_id, repr(e))
                        continue
                else:
                    # Propagate end-of-stream with an empty block id.
                    self._bridge.load_data_block(self._count, '')
                break
            self._count += 1
        else:
            block = self._block_queue.get()
        return block