Пример #1
0
def run(args):
    """Set up logging, build the boosting-tree ensemble and run one mode.

    The log level comes from ``args.verbosity`` (0 -> WARNING, 1 -> INFO,
    anything else -> DEBUG). For every role other than ``'local'`` a
    ``Bridge`` is created from ``args.local_addr`` / ``args.peer_addr``; a
    local run uses no bridge. A saved model is loaded first when
    ``args.load_model_path`` is set, then ``train`` (mode ``'train'``) or
    ``test`` (modes ``'test'`` and ``'eval'``) is called, and finally the
    model is saved when ``args.export_path`` is set.

    Args:
        args: parsed command-line namespace carrying the attributes read
            below.

    Raises:
        AssertionError: if ``args.role`` or ``args.mode`` is not one of the
            accepted values.
        Exception: anything raised while building or running the model is
            logged together with its traceback and then re-raised.
    """
    # Map the verbosity counter onto a logging level; unknown values -> DEBUG.
    log_level = {0: logging.WARNING, 1: logging.INFO}.get(
        args.verbosity, logging.DEBUG)
    logging.basicConfig(level=log_level)

    assert args.role in ('leader', 'follower', 'local'), \
        "role must be leader, follower, or local"
    assert args.mode in ('train', 'test', 'eval'), \
        "mode must be train, test, or eval"

    # Leader and follower roles get a bridge; a 'local' run has none.
    bridge = None
    if args.role != 'local':
        bridge = Bridge(
            args.role,
            int(args.local_addr.split(':')[1]),  # port part of "host:port"
            args.peer_addr,
            args.application_id,
            0,
            streaming_mode=args.use_streaming)

    try:
        # Build the boosting model around the (possibly absent) bridge.
        booster_options = {
            'learning_rate': args.learning_rate,
            'max_iters': args.max_iters,
            'max_depth': args.max_depth,
            'l2_regularization': args.l2_regularization,
            'max_bins': args.max_bins,
            'num_parallel': args.num_parallel,
            'loss_type': args.loss_type,
            'send_scores_to_follower': args.send_scores_to_follower,
            'send_metrics_to_follower': args.send_metrics_to_follower,
        }
        booster = BoostingTreeEnsamble(bridge, **booster_options)

        # Resume from a previously stored model when a path is given.
        if args.load_model_path:
            booster.load_saved_model(args.load_model_path)

        if args.mode == 'train':
            # train() is not handed the bridge; the booster was already
            # constructed with it above.
            train(args, booster)
        else:
            # 'test' and 'eval' both go through test(), which also
            # receives the bridge.
            test(args, bridge, booster)

        # Persist the model when an export path is given.
        if args.export_path:
            booster.save_model(args.export_path)
    except Exception as e:
        logging.fatal(
            'Exception raised during training: %s',
            traceback.format_exc())
        raise e
    finally:
        # Shut the bridge down however the run ended.
        if bridge:
            bridge.terminate()
Пример #2
0
def _run_local(role,
               args,
               input_fn,
               model_fn,
               serving_input_receiver_fn,
               export_model_hook=None):
    """Run an in-process trainer master plus one worker for ``role``.

    A master object (``LeaderTrainerMaster`` when ``role == LEADER``,
    otherwise ``FollowerTrainerMaster``) is started on a daemon thread, and
    the worker side (bridge + estimator) then runs in the calling thread.

    Args:
        role: compared against ``LEADER`` to pick the master class; also
            passed to ``Bridge`` and the estimator.
        args: parsed command-line namespace; ``local_addr`` and ``peer_addr``
            are mandatory.
        input_fn: forwarded to the master and to ``train`` / ``evaluate``.
        model_fn: forwarded to the master and to the estimator.
        serving_input_receiver_fn: forwarded to the master.
        export_model_hook: optional hook forwarded to the master.

    Returns:
        None. Returns early, before any bridge is created, when
        ``worker_register()`` reports failure.

    Raises:
        ValueError: if ``args.local_addr`` or ``args.peer_addr`` is empty.
    """
    if not args.local_addr:
        raise ValueError("local-addr is required")
    if not args.peer_addr:
        raise ValueError("peer-addr is required")
    mode = args.mode.lower()

    cluster_spec = _create_cluster_spec(args)
    cluster_server = ClusterServer(cluster_spec, "local")

    # run master
    checkpoint_filename_with_path = _get_checkpoint_filename_with_path(args)
    data_visitor = _create_data_visitor(args)
    master_factory = LeaderTrainerMaster \
        if role == LEADER else FollowerTrainerMaster
    local_master = master_factory(
        cluster_server,
        data_visitor,
        mode,
        model_fn,
        input_fn,
        serving_input_receiver_fn,
        checkpoint_filename_with_path,
        checkpoint_path=args.checkpoint_path,
        save_checkpoint_steps=args.save_checkpoint_steps,
        save_checkpoint_secs=args.save_checkpoint_secs,
        summary_path=args.summary_path,
        summary_save_steps=args.summary_save_steps,
        summary_save_secs=args.summary_save_secs,
        export_path=args.export_path,
        sparse_estimator=args.sparse_estimator,
        export_model_hook=export_model_hook)
    master_thread = threading.Thread(target=local_master.run_forever)
    # Thread.setDaemon() is deprecated; the ``daemon`` attribute is the
    # supported spelling. A daemon thread does not keep the process alive.
    master_thread.daemon = True
    master_thread.start()

    # run worker
    trainer_master = LocalTrainerMasterClient(local_master, 0)
    if not trainer_master.worker_register():
        return
    # The bridge listens on the port part of "host:port" in local_addr.
    bridge = Bridge(role, int(args.local_addr.split(':')[1]), args.peer_addr,
                    args.application_id, 0)

    estimator_factory = \
        SparseFLEstimator if args.sparse_estimator else FLEstimator
    estimator = estimator_factory(cluster_server, trainer_master, bridge, role,
                                  model_fn)

    # Any mode other than 'train' / 'eval' falls through without running
    # the estimator.
    if mode == 'train':
        estimator.train(input_fn)
    elif mode == 'eval':
        estimator.evaluate(input_fn)

    trainer_master.worker_complete(bridge.terminated_at)
    trainer_master.wait_master_complete()
Пример #3
0
def train(args):
    """Load a CSV data set and train, test or evaluate a boosting-tree model.

    In ``'train'`` and ``'test'`` mode the leader and local roles treat the
    last CSV column as the label and the remaining columns as features; the
    follower role, and every role in ``'eval'`` mode, uses all columns as
    features and has no labels.

    Args:
        args: parsed command-line namespace carrying the attributes read
            below.

    Raises:
        AssertionError: if ``args.role`` or ``args.mode`` is not one of the
            accepted values.
        ValueError: if ``args.data_path`` does not end with ``.csv``.
    """
    if args.verbosity == 0:
        logging.basicConfig(level=logging.WARNING)
    elif args.verbosity == 1:
        logging.basicConfig(level=logging.INFO)
    else:
        logging.basicConfig(level=logging.DEBUG)

    assert args.role in ['leader', 'follower', 'local'], \
        "role must be leader, follower, or local"
    assert args.mode in ['train', 'test', 'eval'], \
        "mode must be train, test, or eval"

    if not args.data_path.endswith('.csv'):
        raise ValueError("Unsupported data type %s" % args.data_path)

    with open(args.data_path, 'rb') as fin:
        data = np.loadtxt(fin, delimiter=',')

    # Labels exist only for leader/local in train or test mode.
    has_labels = (args.mode in ('train', 'test')
                  and args.role in ('leader', 'local'))
    if has_labels:
        X = data[:, :-1]
        y = data[:, -1]
    else:
        X = data
        y = None

    if args.role != 'local':
        # The bridge listens on the port part of "host:port" in local_addr.
        bridge = Bridge(args.role, int(args.local_addr.split(':')[1]),
                        args.peer_addr, args.application_id, 0)
    else:
        bridge = None

    booster = BoostingTreeEnsamble(bridge,
                                   learning_rate=args.learning_rate,
                                   max_iters=args.max_iters,
                                   max_depth=args.max_depth,
                                   l2_regularization=args.l2_regularization,
                                   max_bins=args.max_bins,
                                   num_parallel=args.num_parallel)

    if args.load_model_path:
        booster.load_saved_model(args.load_model_path)

    if args.mode == 'train':
        booster.fit(X, y, args.checkpoint_path)
    elif args.mode == 'test':
        pred = booster.batch_predict(X)
        if y is not None:
            acc = sum((pred > 0.5) == y) / len(y)
            logging.info("Test accuracy: %f", acc)
        else:
            # The follower holds no labels, so len(y) used to raise
            # TypeError here after prediction had already run.
            logging.info(
                "No labels available for role %s; skipping accuracy",
                args.role)
    else:
        pred = booster.batch_predict(X)
        for i in pred:
            print(i)

    if args.export_path:
        booster.save_model(args.export_path)
Пример #4
0
def train(role, args, input_fn, model_fn, serving_input_receiver_fn):
    """Train an ``FLEstimator`` and optionally export the trained model.

    The trainer master and the TensorFlow cluster spec come from the first
    of these that is set: ``args.cluster_spec`` (a JSON string),
    ``args.master_addr``, or ``args.data_path`` (local master, no cluster).

    Args:
        role: passed to ``Bridge``, the trainer-master client and the
            estimator.
        args: parsed command-line namespace carrying the attributes read
            below.
        input_fn: forwarded to ``estimator.train``.
        model_fn: forwarded to ``FLEstimator``.
        serving_input_receiver_fn: forwarded to ``export_saved_model``.

    Raises:
        AssertionError: if the JSON cluster spec lacks a required entry, or
            if ``args.tf_addr`` / ``args.ps_addrs`` is missing while
            ``args.master_addr`` is set.
        ValueError: if none of the three configuration sources is set.
    """
    # The bridge listens on the port part of "host:port" in local_addr.
    bridge = Bridge(role, int(args.local_addr.split(':')[1]),
                    args.peer_addr)

    if args.cluster_spec:
        cluster_spec = json.loads(args.cluster_spec)
        assert 'clusterSpec' in cluster_spec, \
            "cluster_spec do not meet legal format"
        assert 'Master' in cluster_spec['clusterSpec'],\
            "cluster_spec must include Master"
        assert isinstance(cluster_spec['clusterSpec']['Master'], list), \
            "Master must be list"
        assert 'Worker' in cluster_spec['clusterSpec'],\
            "cluster_spec must include Worker"
        assert isinstance(cluster_spec['clusterSpec']['Worker'], list), \
            "Worker must be list"
        # 'PS' is read unconditionally below; validate it like its siblings
        # so a missing entry is reported clearly instead of as a KeyError.
        assert 'PS' in cluster_spec['clusterSpec'],\
            "cluster_spec must include PS"
        trainer_master = TrainerMasterClient(
            cluster_spec['clusterSpec']['Master'][0], role, args.worker_rank)
        cluster_spec = tf.train.ClusterSpec({
            'ps': cluster_spec['clusterSpec']['PS'],
            'worker': {args.worker_rank: args.tf_addr}})

    elif args.master_addr:
        assert args.tf_addr is not None, \
            "--tf-addr must be set when master_addr is set."
        # ps_addrs is split below, so None would fail with AttributeError.
        assert args.ps_addrs is not None, \
            "--ps-addrs must be set when master_addr is set."
        trainer_master = TrainerMasterClient(
            args.master_addr, role, args.worker_rank)
        ps_addrs = args.ps_addrs.split(",")
        cluster_spec = tf.train.ClusterSpec({
            'ps': ps_addrs,
            'worker': {args.worker_rank: args.tf_addr}})
    elif args.data_path:
        trainer_master = LocalTrainerMasterClient(role, args.data_path)
        cluster_spec = None
    else:
        raise ValueError("Either --master-addr or --data-path must be set")

    estimator = FLEstimator(
        model_fn, bridge, trainer_master, role, worker_rank=args.worker_rank,
        cluster_spec=cluster_spec)
    if args.checkpoint_path:
        estimator.train(input_fn,
                        checkpoint_path=args.checkpoint_path,
                        save_checkpoint_steps=args.save_checkpoint_steps)
    else:
        estimator.train(input_fn)

    if args.export_path:
        estimator.export_saved_model(args.export_path,
                                     serving_input_receiver_fn,
                                     checkpoint_path=args.checkpoint_path)
Пример #5
0
def _run_worker(role, args, input_fn, model_fn):
    """Run one worker that registers with a remote trainer master.

    Args:
        role: passed to ``Bridge`` and to the estimator.
        args: parsed command-line namespace; ``local_addr``, ``peer_addr``
            and ``master_addr`` are mandatory.
        input_fn: forwarded to ``train`` / ``evaluate``.
        model_fn: forwarded to the estimator.

    Returns:
        None. Returns early, before any bridge is created, when
        ``worker_register()`` reports failure.

    Raises:
        ValueError: if any of the three mandatory addresses is empty.
    """
    # Check the mandatory addresses in a fixed order; the first empty one
    # aborts the run.
    required = (
        ('local_addr', "local-addr is required"),
        ('peer_addr', "peer-addr is required"),
        ('master_addr', "master-addr is required"),
    )
    for attr_name, complaint in required:
        if not getattr(args, attr_name):
            raise ValueError(complaint)

    run_mode = args.mode.lower()

    spec = _create_cluster_spec(args, require_ps=True)
    server = ClusterServer(spec, "worker", task_index=args.worker_rank)

    master_client = TrainerMasterClient(args.master_addr, args.worker_rank)
    registered = master_client.worker_register(spec.as_cluster_def())
    if not registered:
        return

    # The bridge listens on the port part of "host:port" in local_addr.
    listen_port = int(args.local_addr.split(':')[1])
    bridge = Bridge(role, listen_port, args.peer_addr,
                    args.application_id, args.worker_rank)

    if args.sparse_estimator:
        estimator_cls = SparseFLEstimator
    else:
        estimator_cls = FLEstimator
    # The worker with rank 0 is flagged as chief.
    estimator = estimator_cls(server, master_client, bridge, role, model_fn,
                              is_chief=args.worker_rank == 0)

    # Any mode other than 'train' / 'eval' runs no estimator step.
    if run_mode == 'train':
        estimator.train(input_fn)
    elif run_mode == 'eval':
        estimator.evaluate(input_fn)

    master_client.worker_complete(bridge.terminated_at)
    master_client.wait_master_complete()
Пример #6
0
def train(role, args, input_fn, model_fn, serving_input_receiver_fn):
    """Train or evaluate a federated estimator; export after training.

    The trainer master and the TensorFlow cluster spec come from the first
    of these that is set: ``args.data_path``, ``args.cluster_spec`` (a JSON
    string), ``args.master_addr``, or ``args.data_source``.

    After training, the worker with rank 0 exports the model to
    ``<export_path>/<bridge.terminated_at>`` and writes a ``_SUCCESS``
    marker file there, provided ``args.export_path`` is set.

    Args:
        role: passed to ``Bridge``, the trainer-master client, the summary
            hook and the estimator.
        args: parsed command-line namespace carrying the attributes read
            below.
        input_fn: forwarded to ``estimator.train`` / ``estimator.evaluate``.
        model_fn: forwarded to the estimator.
        serving_input_receiver_fn: forwarded to ``export_saved_model``.

    Raises:
        AssertionError: if the JSON cluster spec lacks a required entry, or
            if ``args.tf_addr`` / ``args.ps_addrs`` is missing while
            ``args.master_addr`` is set.
        ValueError: if no configuration source is set, if a data source is
            given without start/end time, or if the mode is neither
            ``train`` nor ``eval``.
    """
    logging.basicConfig(
        format=("%(asctime)-15s [%(filename)s:%(lineno)d] "
                "%(levelname)s : %(message)s"))
    # A negative verbosity leaves the root logger level untouched.
    if args.verbosity == 0:
        logging.getLogger().setLevel(logging.WARNING)
    elif args.verbosity == 1:
        logging.getLogger().setLevel(logging.INFO)
    elif args.verbosity > 1:
        logging.getLogger().setLevel(logging.DEBUG)

    # The bridge listens on the port part of "host:port" in local_addr.
    if args.application_id:
        bridge = Bridge(role, int(args.local_addr.split(':')[1]),
                        args.peer_addr, args.application_id, args.worker_rank)
    else:
        bridge = Bridge(role, int(args.local_addr.split(':')[1]),
                        args.peer_addr)

    if args.data_path:
        trainer_master = LocalTrainerMasterClient(role,
                                                  args.data_path,
                                                  epoch_num=args.epoch_num)
        if args.ps_addrs is not None:
            ps_addrs = args.ps_addrs.split(",")
            cluster_spec = tf.train.ClusterSpec({
                'ps': ps_addrs,
                'worker': {
                    args.worker_rank: args.tf_addr
                }
            })
        else:
            cluster_spec = None
    elif args.cluster_spec:
        cluster_spec = json.loads(args.cluster_spec)
        assert 'clusterSpec' in cluster_spec, \
            "cluster_spec do not meet legal format"
        assert 'Master' in cluster_spec['clusterSpec'],\
            "cluster_spec must include Master"
        assert isinstance(cluster_spec['clusterSpec']['Master'], list), \
            "Master must be list"
        assert 'Worker' in cluster_spec['clusterSpec'],\
            "cluster_spec must include Worker"
        assert isinstance(cluster_spec['clusterSpec']['Worker'], list), \
            "Worker must be list"
        # 'PS' is read unconditionally below; validate it like its siblings
        # so a missing entry is reported clearly instead of as a KeyError.
        assert 'PS' in cluster_spec['clusterSpec'],\
            "cluster_spec must include PS"
        trainer_master = TrainerMasterClient(
            cluster_spec['clusterSpec']['Master'][0], role, args.worker_rank)
        cluster_spec = tf.train.ClusterSpec({
            'ps':
            cluster_spec['clusterSpec']['PS'],
            'worker': {
                args.worker_rank: args.tf_addr
            }
        })
    elif args.master_addr:
        assert args.tf_addr is not None, \
            "--tf-addr must be set when master_addr is set."
        # ps_addrs is split below, so None would fail with AttributeError.
        assert args.ps_addrs is not None, \
            "--ps-addrs must be set when master_addr is set."
        trainer_master = TrainerMasterClient(args.master_addr, role,
                                             args.worker_rank)
        ps_addrs = args.ps_addrs.split(",")
        cluster_spec = tf.train.ClusterSpec({
            'ps': ps_addrs,
            'worker': {
                args.worker_rank: args.tf_addr
            }
        })
    elif args.data_source:
        if args.start_time is None or args.end_time is None:
            raise ValueError(
                "data source must be set with start-date and end-date")
        trainer_master = LocalTrainerMasterClient(role,
                                                  args.data_source,
                                                  start_time=args.start_time,
                                                  end_time=args.end_time,
                                                  epoch_num=args.epoch_num)
        cluster_spec = None
    else:
        raise ValueError("Either --master-addr or --data-path must be set")

    # SummaryHook is configured through class attributes.
    if args.summary_path:
        SummaryHook.summary_path = args.summary_path
        SummaryHook.worker_rank = args.worker_rank
        SummaryHook.role = role
    if args.summary_save_steps:
        SummaryHook.save_steps = args.summary_save_steps

    if args.sparse_estimator:
        estimator = SparseFLEstimator(model_fn,
                                      bridge,
                                      trainer_master,
                                      role,
                                      worker_rank=args.worker_rank,
                                      application_id=args.application_id,
                                      cluster_spec=cluster_spec)
    else:
        estimator = FLEstimator(model_fn,
                                bridge,
                                trainer_master,
                                role,
                                worker_rank=args.worker_rank,
                                application_id=args.application_id,
                                cluster_spec=cluster_spec)

    run_mode = args.mode.lower()
    if run_mode == 'train':
        estimator.train(input_fn,
                        checkpoint_path=args.checkpoint_path,
                        save_checkpoint_steps=args.save_checkpoint_steps,
                        save_checkpoint_secs=args.save_checkpoint_secs)
        # Only the rank-0 worker exports the model.
        if args.export_path and args.worker_rank == 0:
            export_path = '%s/%d' % (args.export_path, bridge.terminated_at)
            estimator.export_saved_model(export_path,
                                         serving_input_receiver_fn,
                                         checkpoint_path=args.checkpoint_path)
            # Use a context manager so the marker file is closed even if
            # the write raises.
            with tf.io.gfile.GFile('%s/_SUCCESS' % export_path,
                                   'w') as fsuccess:
                fsuccess.write('%d' % bridge.terminated_at)

    elif run_mode == 'eval':
        estimator.evaluate(input_fn, checkpoint_path=args.checkpoint_path)
    else:
        raise ValueError('Allowed values are: --mode=train|eval')
Пример #7
0
    def test_bridge(self):
        """Send two constants over a leader/follower bridge pair.

        The leader side sends ``x = 3.0`` and ``y = 2.0``; the follower side
        receives both and the test checks that ``x - y`` equals ``1.0``.
        """
        leader = Bridge('leader', 49951, 'localhost:49952')
        follower = Bridge('follower', 49952, 'localhost:49951')

        # connect() runs on both ends at the same time: the leader in a
        # helper thread, the follower in this thread.
        connector = threading.Thread(target=leader.connect)
        connector.start()
        follower.connect()
        connector.join()

        # Sending side: one graph holding the two send ops.
        send_graph = tf.Graph()
        with send_graph.as_default():
            x = tf.constant(3.0, name='x')
            y = tf.constant(2.0, name='y')
            send_ops = [leader.send_op('x', x), leader.send_op('y', y)]

        # Receiving side: a separate graph that subtracts the received
        # values.
        recv_graph = tf.Graph()
        with recv_graph.as_default():
            recv_x = follower.receive_op('x', dtype=tf.float32)
            recv_y = follower.receive_op('y', dtype=tf.float32)
            difference = recv_x - recv_y

        leader.start()
        follower.start()
        with tf.Session(graph=send_graph) as sess:
            sess.run(send_ops)
        with tf.Session(graph=recv_graph) as sess:
            self.assertEqual(sess.run(difference), 1.0)
        leader.commit()
        follower.commit()

        time.sleep(3)

        # terminate() is likewise called on both ends concurrently.
        closer = threading.Thread(target=leader.terminate)
        closer.start()
        follower.terminate()
        closer.join()
Пример #8
0
def train(role, args, input_fn, model_fn, serving_input_receiver_fn):
    """Train or evaluate a federated estimator, then optionally export it.

    The trainer master and the TensorFlow cluster spec come from the first
    of these that is set: ``args.data_path``, ``args.cluster_spec`` (a JSON
    string), ``args.master_addr``, or ``args.data_source``.

    Args:
        role: passed to ``Bridge``, the trainer-master client, the summary
            hook and the estimator.
        args: parsed command-line namespace carrying the attributes read
            below.
        input_fn: forwarded to ``estimator.train`` / ``estimator.evaluate``.
        model_fn: forwarded to the estimator.
        serving_input_receiver_fn: forwarded to ``export_saved_model``.

    Raises:
        AssertionError: if the JSON cluster spec lacks a required entry, or
            if ``args.tf_addr`` / ``args.ps_addrs`` is missing while
            ``args.master_addr`` is set.
        ValueError: if no configuration source is set, if a data source is
            given without start/end time, or if the mode is neither
            ``train`` nor ``eval``.
    """
    # The bridge listens on the port part of "host:port" in local_addr.
    if args.application_id:
        bridge = Bridge(role, int(args.local_addr.split(':')[1]),
                        args.peer_addr, args.application_id, args.worker_rank)
    else:
        bridge = Bridge(role, int(args.local_addr.split(':')[1]),
                        args.peer_addr)

    if args.data_path:
        trainer_master = LocalTrainerMasterClient(role, args.data_path)
        if args.ps_addrs is not None:
            ps_addrs = args.ps_addrs.split(",")
            cluster_spec = tf.train.ClusterSpec({
                'ps': ps_addrs,
                'worker': {
                    args.worker_rank: args.tf_addr
                }
            })
        else:
            cluster_spec = None
    elif args.cluster_spec:
        cluster_spec = json.loads(args.cluster_spec)
        assert 'clusterSpec' in cluster_spec, \
            "cluster_spec do not meet legal format"
        assert 'Master' in cluster_spec['clusterSpec'],\
            "cluster_spec must include Master"
        assert isinstance(cluster_spec['clusterSpec']['Master'], list), \
            "Master must be list"
        assert 'Worker' in cluster_spec['clusterSpec'],\
            "cluster_spec must include Worker"
        assert isinstance(cluster_spec['clusterSpec']['Worker'], list), \
            "Worker must be list"
        # 'PS' is read unconditionally below; validate it like its siblings
        # so a missing entry is reported clearly instead of as a KeyError.
        assert 'PS' in cluster_spec['clusterSpec'],\
            "cluster_spec must include PS"
        trainer_master = TrainerMasterClient(
            cluster_spec['clusterSpec']['Master'][0], role, args.worker_rank)
        cluster_spec = tf.train.ClusterSpec({
            'ps':
            cluster_spec['clusterSpec']['PS'],
            'worker': {
                args.worker_rank: args.tf_addr
            }
        })
    elif args.master_addr:
        assert args.tf_addr is not None, \
            "--tf-addr must be set when master_addr is set."
        # ps_addrs is split below, so None would fail with AttributeError.
        assert args.ps_addrs is not None, \
            "--ps-addrs must be set when master_addr is set."
        trainer_master = TrainerMasterClient(args.master_addr, role,
                                             args.worker_rank)
        ps_addrs = args.ps_addrs.split(",")
        cluster_spec = tf.train.ClusterSpec({
            'ps': ps_addrs,
            'worker': {
                args.worker_rank: args.tf_addr
            }
        })
    elif args.data_source:
        if args.start_time is None or args.end_time is None:
            raise ValueError(
                "data source must be set with start-date and end-date")
        trainer_master = LocalTrainerMasterClient(role,
                                                  args.data_source,
                                                  start_time=args.start_time,
                                                  end_time=args.end_time)
        cluster_spec = None
    else:
        raise ValueError("Either --master-addr or --data-path must be set")

    # SummaryHook is configured through class attributes.
    if args.summary_path:
        SummaryHook.summary_path = args.summary_path
        SummaryHook.worker_rank = args.worker_rank
        SummaryHook.role = role
    if args.summary_save_steps:
        SummaryHook.save_steps = args.summary_save_steps

    if args.sparse_estimator:
        estimator = SparseFLEstimator(model_fn,
                                      bridge,
                                      trainer_master,
                                      role,
                                      worker_rank=args.worker_rank,
                                      cluster_spec=cluster_spec)
    else:
        estimator = FLEstimator(model_fn,
                                bridge,
                                trainer_master,
                                role,
                                worker_rank=args.worker_rank,
                                cluster_spec=cluster_spec)

    run_mode = args.mode.lower()
    if run_mode == 'train':
        estimator.train(input_fn,
                        checkpoint_path=args.checkpoint_path,
                        save_checkpoint_steps=args.save_checkpoint_steps,
                        save_checkpoint_secs=args.save_checkpoint_secs)
    elif run_mode == 'eval':
        estimator.evaluate(input_fn, checkpoint_path=args.checkpoint_path)
    else:
        raise ValueError('Allowed values are: --mode=train|eval')

    # The export runs after either mode whenever an export path is given.
    if args.export_path:
        estimator.export_saved_model(args.export_path,
                                     serving_input_receiver_fn,
                                     checkpoint_path=args.checkpoint_path)