Example #1
    def test_barrier(self):
        try:
            import netifaces
        except ImportError:
            print("warning: no netifaces, skip test_barrier")
            return

        gloo = fluid.core.Gloo()
        gloo.set_rank(0)        # rank of this process in the comm world
        gloo.set_size(1)        # total number of participants
        gloo.set_prefix("123")  # key prefix inside the rendezvous store
        gloo.set_iface("lo")    # network interface; loopback for local tests
        # file-based rendezvous store (empty fs name/ugi -> local filesystem)
        gloo.set_hdfs_store("./tmp_test_fleet_barrier", "", "")
        gloo.init()             # rendezvous and build the comm world

        role = role_maker.UserDefinedRoleMaker(
            is_collective=False,
            init_gloo=False,
            current_id=0,
            role=role_maker.Role.SERVER,
            worker_endpoints=["127.0.0.1:6003"],
            server_endpoints=["127.0.0.1:6001"])
        role._node_type_comm = gloo
        role._role_is_generated = True
        fleet_util._set_role_maker(role)

        fleet_util.barrier("worker")
Example #2
    def test_all_gather(self):
        try:
            import netifaces
        except ImportError:
            print("warning: no netifaces, skip test_all_gather")
            return

        gloo = fluid.core.Gloo()
        gloo.set_rank(0)
        gloo.set_size(1)
        gloo.set_prefix("123")
        gloo.set_iface("lo")
        gloo.set_hdfs_store("./tmp_test_fleet_reduce", "", "")
        gloo.init()

        role = role_maker.UserDefinedRoleMaker(
            is_collective=False,
            init_gloo=False,
            current_id=0,
            role=role_maker.Role.SERVER,
            worker_endpoints=["127.0.0.1:6003"],
            server_endpoints=["127.0.0.1:6001"])
        role._node_type_comm = gloo
        role._all_comm = gloo
        role._role_is_generated = True
        fleet_util._set_role_maker(role)

        output = fleet_util.all_gather(1, comm_world="all")
        print(output)
        # self.assertTrue(len(output) == 1 and output[0] == 1)
        self.assertRaises(Exception, fleet_util.all_gather, 1, "test")
Example #3
    def test(self):
        endpoints = [
            "127.0.0.1:36004", "127.0.0.1:36005", "127.0.0.1:36006",
            "127.0.0.1:36007"
        ]

        role = role_maker.UserDefinedRoleMaker(current_id=0,
                                               role=role_maker.Role.SERVER,
                                               worker_num=2,
                                               server_endpoints=endpoints)

        fleet.init(role)
        loss, acc, _ = self.net()

        optimizer = fluid.optimizer.Adam(
            learning_rate=fluid.layers.exponential_decay(learning_rate=base_lr,
                                                         decay_steps=500,
                                                         decay_rate=0.969,
                                                         staircase=True))

        strategy = paddle.distributed.fleet.DistributedStrategy()
        strategy.a_sync = True

        optimizer = fleet.distributed_optimizer(optimizer, strategy)
        optimizer.minimize(loss)
Example #4
def test_print_on_rank():
    """test ps print on rank"""
    role = role_maker.UserDefinedRoleMaker(
        is_collective=False,
        init_gloo=False,
        current_id=0,
        role=role_maker.Role.WORKER,
        worker_endpoints=["127.0.0.1:6003", "127.0.0.1:6004"],
        server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])
    fleet.init(role)

    fleet.util.print_on_rank("test_print_on_rank0 ... ok", 0)
Example #5
    def test_tr_rolemaker(self):
        ro = role_maker.UserDefinedRoleMaker(
            is_collective=False,
            init_gloo=False,
            server_endpoints=["127.0.0.1:36001", "127.0.0.1:36001"],
            role=role_maker.Role.WORKER,
            current_id=0,
            worker_num=2)

        self.assertIn("127.0.0.1:36001", ro._get_pserver_endpoints())
        self.assertTrue(ro._is_worker())
        self.assertEqual(ro._role_id(), 0)
Example #6
    def test_ps_rolemaker(self):

        ro = role_maker.UserDefinedRoleMaker(
            is_collective=False,
            init_gloo=False,
            server_endpoints=["127.0.0.1:36001", "127.0.0.1:36001"],
            role=role_maker.Role.SERVER,
            current_id=0,
            worker_num=2)
        self.assertEqual(ro._server_num(), 2)
        ro._generate_role()
        self.assertTrue(ro._is_server())
        self.assertEqual(ro._role_id(), 0)
Example #7
    def build_role(self, args):

        if args.role.upper() == "PSERVER":
            role = role_maker.UserDefinedRoleMaker(
                is_collective=False,
                init_gloo=False,
                path=args.gloo_path,
                current_id=args.current_id,
                role=role_maker.Role.SERVER,
                worker_endpoints=args.trainer_endpoints.split(","),
                server_endpoints=args.endpoints.split(","))
        else:
            role = role_maker.UserDefinedRoleMaker(
                is_collective=False,
                init_gloo=False,
                path=args.gloo_path,
                current_id=args.current_id,
                role=role_maker.Role.WORKER,
                worker_endpoints=args.trainer_endpoints.split(","),
                server_endpoints=args.endpoints.split(","))
        self.role = role
        return role
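For context, a hypothetical driver showing how the args object consumed by build_role() could be assembled; every flag name below is an assumption inferred from the attributes the method reads, not part of the original script.

import argparse

# Hypothetical CLI wiring for build_role(); flag names mirror the attributes
# it accesses: role, gloo_path, current_id, trainer_endpoints, endpoints.
parser = argparse.ArgumentParser()
parser.add_argument("--role", default="TRAINER")  # "PSERVER" selects Role.SERVER
parser.add_argument("--gloo_path", default="/tmp/gloo")
parser.add_argument("--current_id", type=int, default=0)
parser.add_argument("--trainer_endpoints", default="127.0.0.1:6003,127.0.0.1:6004")
parser.add_argument("--endpoints", default="127.0.0.1:6001,127.0.0.1:6002")
args = parser.parse_args()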
Example #8
def test_file_shard():
    """test ps file shard"""
    role = role_maker.UserDefinedRoleMaker(
        is_collective=False,
        init_gloo=False,
        current_id=0,
        role=role_maker.Role.WORKER,
        worker_endpoints=["127.0.0.1:6003", "127.0.0.1:6004"],
        server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])
    fleet.init(role)

    files = fleet.util.get_file_shard(["file1", "file2", "file3"])
    print(files)
    assert len(files) == 2
Example #9
    def test_get_file_shard(self):
        import paddle.distributed.fleet as fleet
        self.assertRaises(Exception, fleet.util.get_file_shard, "files")

        role = role_maker.UserDefinedRoleMaker(
            is_collective=False,
            init_gloo=False,
            current_id=0,
            role=role_maker.Role.WORKER,
            worker_endpoints=["127.0.0.1:6003", "127.0.0.1:6004"],
            server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])
        fleet.init(role)

        files = fleet.util.get_file_shard(["1", "2", "3"])
        self.assertTrue(len(files) == 2 and "1" in files and "2" in files)
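The assertions above pin down the sharding rule these tests expect: files are split as evenly as possible across workers, with the remainder going to the lowest-ranked ones. A standalone sketch of that arithmetic follows; it is a model of the behavior the tests exercise, not fleet's actual implementation.

def shard_files(files, worker_num, worker_id):
    # Even split, remainder assigned to the lowest ranks (an assumption
    # consistent with the assertions in Examples #8 and #9).
    base, remainder = divmod(len(files), worker_num)
    start = worker_id * base + min(worker_id, remainder)
    end = start + base + (1 if worker_id < remainder else 0)
    return files[start:end]

assert shard_files(["1", "2", "3"], worker_num=2, worker_id=0) == ["1", "2"]
assert shard_files(["1", "2", "3"], worker_num=2, worker_id=1) == ["3"]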
Example #10
    def test_tr_rolemaker(self):
        try:
            import netifaces
        except ImportError:
            print("warning: no netifaces, skip test_tr_rolemaker")
            return

        ro = role_maker.UserDefinedRoleMaker(
            is_collective=False,
            init_gloo=False,
            server_endpoints="127.0.0.1:36001,127.0.0.1:36001",
            role=role_maker.Role.WORKER,
            current_id=0,
            worker_num=2)
        self.assertIn("127.0.0.1:36001", ro.get_pserver_endpoints())
        self.assertTrue(ro.is_worker())
        self.assertEqual(ro.role_id(), 0)
Example #11
    def run_ut(self):
        strategy = paddle.distributed.fleet.DistributedStrategy()
        strategy.a_sync = True

        training_role = os.getenv("TRAINING_ROLE", "TRAINER")

        role = role_maker.UserDefinedRoleMaker(
            current_id=0,
            role=role_maker.Role.WORKER
            if training_role == "TRAINER" else role_maker.Role.SERVER,
            worker_num=1,
            server_endpoints=["127.0.0.1:6002"])

        if training_role == "TRAINER":
            self.run_trainer(role, strategy)
        else:
            self.run_pserver(role, strategy)
Example #12
    def test_get_file_shard(self):
        self.assertRaises(Exception, fleet_util.get_file_shard, "files")
        try:
            import netifaces
        except ImportError:
            print("warning: no netifaces, skip test_get_file_shard")
            return

        role = role_maker.UserDefinedRoleMaker(
            is_collective=False,
            init_gloo=False,
            current_id=0,
            role=role_maker.Role.WORKER,
            worker_endpoints=["127.0.0.1:6003", "127.0.0.1:6004"],
            server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])
        fleet_util._set_role_maker(role)
        files = fleet_util.get_file_shard(["1", "2", "3"])
        self.assertTrue(len(files) == 2 and "1" in files and "2" in files)
Example #13
    def test(self):
        endpoints = [
            "127.0.0.1:36004", "127.0.0.1:36005", "127.0.0.1:36006",
            "127.0.0.1:36007"
        ]

        role = role_maker.UserDefinedRoleMaker(
            current_id=0,
            role=role_maker.Role.SERVER,
            worker_num=2,
            server_endpoints=endpoints)

        fleet.init(role)
        loss, acc, _ = self.net()
        optimizer = fluid.optimizer.Adam(base_lr)

        strategy = paddle.distributed.fleet.DistributedStrategy()
        strategy.a_sync = True
        optimizer = fleet.distributed_optimizer(optimizer, strategy)
        optimizer.minimize(loss)
Example #14
    def test_pserver(self):
        role = role_maker.UserDefinedRoleMaker(
            current_id=0,
            role=role_maker.Role.SERVER,
            worker_num=2,
            server_endpoints=["127.0.0.1:36011", "127.0.0.1:36012"])

        fleet.init(role)

        batch_size = 128
        is_sparse = True
        is_distribute = False

        strategy = paddle.distributed.fleet.DistributedStrategy()
        strategy.a_sync = True
        strategy.a_sync_configs = {"k_steps": 100, "launch_barrier": False}

        avg_cost, _, _, _ = train_network(batch_size, is_distribute, is_sparse)

        optimizer = fluid.optimizer.SGD(0.1)
        optimizer = fleet.distributed_optimizer(optimizer, strategy)
        optimizer.minimize(avg_cost)
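The a_sync_configs dict above is where these tests tune asynchronous behavior. A brief sketch of the two knobs as they are used here; the descriptions are assumptions based on how the tests employ them, not authoritative documentation.

import paddle.distributed.fleet as fleet

strategy = fleet.DistributedStrategy()
strategy.a_sync = True  # asynchronous parameter-server training
strategy.a_sync_configs = {
    # push/pull updates every k steps instead of every step (0 = fully async)
    "k_steps": 100,
    # skip the startup barrier so a single test process does not block
    "launch_barrier": False,
}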
Example #15
    def test(self):
        endpoints = [
            "127.0.0.1:36004", "127.0.0.1:36005", "127.0.0.1:36006",
            "127.0.0.1:36007"
        ]

        role = role_maker.UserDefinedRoleMaker(
            current_id=0,
            role=role_maker.Role.SERVER,
            worker_num=2,
            server_endpoints=endpoints)

        fleet.init(role)
        loss = self.net()
        scheduler = paddle.optimizer.lr.NoamDecay(
            d_model=0.01, warmup_steps=100, verbose=True)
        optimizer = fluid.optimizer.Adam(scheduler)

        strategy = paddle.distributed.fleet.DistributedStrategy()
        strategy.a_sync = True
        optimizer = fleet.distributed_optimizer(optimizer, strategy)
        optimizer.minimize(loss)
        fleet.init_server()
Example #16
    def test(self):
        endpoints = [
            "127.0.0.1:36004", "127.0.0.1:36005", "127.0.0.1:36006",
            "127.0.0.1:36007"
        ]

        role = role_maker.UserDefinedRoleMaker(current_id=0,
                                               role=role_maker.Role.SERVER,
                                               worker_num=2,
                                               server_endpoints=endpoints)

        fleet.init(role)
        loss = self.net()
        scheduler = paddle.optimizer.lr.InverseTimeDecay(learning_rate=base_lr,
                                                         gamma=0.999,
                                                         verbose=True)
        optimizer = fluid.optimizer.Adam(scheduler)

        strategy = paddle.distributed.fleet.DistributedStrategy()
        strategy.a_sync = True
        optimizer = fleet.distributed_optimizer(optimizer, strategy)
        optimizer.minimize(loss)
        fleet.init_server()
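Both schedulers in Examples #15 and #16 come from paddle.optimizer.lr and can be stepped standalone. A minimal sketch of the decay used above, assuming Paddle 2.x, where InverseTimeDecay scales the base rate by 1 / (1 + gamma * epoch):

import paddle

scheduler = paddle.optimizer.lr.InverseTimeDecay(learning_rate=0.1, gamma=0.999)
for epoch in range(3):
    print(scheduler.get_lr())  # 0.1, 0.1 / (1 + 0.999), 0.1 / (1 + 2 * 0.999)
    scheduler.step()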
Example #17
    def test_communicator_async(self):
        role = role_maker.UserDefinedRoleMaker(
            current_id=0,
            role=role_maker.Role.WORKER,
            worker_num=2,
            server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])

        fleet.init(role)
        avg_cost = self.net()

        optimizer = fluid.optimizer.SGD(0.01)

        strategy = paddle.distributed.fleet.DistributedStrategy()
        strategy.a_sync = True
        strategy.a_sync_configs = {"launch_barrier": False}

        optimizer = fleet.distributed_optimizer(optimizer, strategy)
        optimizer.minimize(avg_cost)

        os.environ["TEST_MODE"] = "1"
        fleet.init_worker()
        time.sleep(10)
        fleet.stop_worker()
Example #18
def fit():
    EPOCH_NUM = 3
    BATCH_SIZE = 128
    type_size = 10
    role = role_maker.UserDefinedRoleMaker(
        current_id=0,
        role=role_maker.Role.SERVER,
        worker_num=2,
        server_endpoints=["127.0.0.1:36011", "127.0.0.1:36012"])

    fleet.init(role)
    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.a_sync = True

    type_size = createDataList('F:/机器学习/CNN/train',
                               'D:/cnn/cnn.model.data' + "/")
    # Data provider for training
    train_reader = dataReader("D:/cnn/cnn.model.data/trainer.list")
    train_reader = paddle.batch(paddle.reader.shuffle(reader=train_reader,
                                                      buf_size=BATCH_SIZE *
                                                      100),
                                batch_size=BATCH_SIZE)
    test_reader = dataReader("D:/cnn/cnn.model.data/test.list")
    test_reader = paddle.batch(paddle.reader.shuffle(reader=test_reader,
                                                     buf_size=BATCH_SIZE *
                                                     100),
                               batch_size=BATCH_SIZE)
    data_shape = [3, 32, 32]
    paddle.enable_static()
    images = fluid.layers.data(name='images',
                               shape=data_shape,
                               dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    # Build the classifier
    predict = networkConfiguration(images, type_size)

    # Define the loss and accuracy
    cost = fluid.layers.cross_entropy(input=predict, label=label)  # cross-entropy loss
    avg_cost = fluid.layers.mean(cost)  # mean over all elements of cost
    acc = fluid.layers.accuracy(input=predict, label=label)  # accuracy from predictions and labels
    # Clone the test program before the optimizer rewrites the main program
    test_program = fluid.default_main_program().clone(for_test=True)
    optimizer = fluid.optimizer.Adam(learning_rate=0.001)  # define the optimizer
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(avg_cost)
    if fleet.is_server():
        fleet.init_server()
        fleet.run_server()

    elif fleet.is_worker():
        fleet.init_worker()
        ########## Model training & evaluation ##########
        # Create the Executor
        use_cuda = False  # False runs on CPU; True would use the GPU
        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
        exe = fluid.Executor(place)
        exe.run(fluid.default_startup_program())
    print("完成")

    # 定义数据映射器
    feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
    for pass_id in range(EPOCH_NUM):
        # 开始训练
        for batch_id, data in enumerate(train_reader()):  # 遍历train_reader
            train_cost, train_acc = exe.run(
                program=fluid.default_main_program(),  # 运行主程序
                feed=feeder.feed(data),  # 喂入一个batch的数据
                fetch_list=[avg_cost, acc])  # fetch均方误差和准确率
            # 每100次batch打印一次训练、进行一次测试
            if batch_id % 20 == 0:
                print('Pass:%d, Batch:%d, Cost:%0.5f, Accuracy:%0.5f' %
                      (pass_id, batch_id, train_cost[0], train_acc[0]))
        # 开始测试
        test_costs = []  # 测试的损失值
        test_accs = []  # 测试的准确率
        for batch_id, data in enumerate(test_reader()):
            test_cost, test_acc = exe.run(
                program=test_program,  # 执行训练程序
                feed=feeder.feed(data),  # 喂入数据
                fetch_list=[avg_cost, acc])  # fetch误差、准确率
            test_costs.append(test_cost[0])  # 记录每个batch的损失值
            test_accs.append(test_acc[0])  # 记录每个batch的准确率

        test_cost = (sum(test_costs) / len(test_costs))  # 计算误差平均值
        test_acc = (sum(test_accs) / len(test_accs))  # 计算准确率平均值
        print('Test:%d, Cost:%0.5f, ACC:%0.5f' %
              (pass_id, test_cost, test_acc))
    save(predict, "D:/cnn/cnn.model", exe)
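Taken together, the examples share one skeleton: build a role, call fleet.init, wrap the optimizer with a DistributedStrategy, then branch on server/worker. A distilled single-file sketch of that pattern, assuming Paddle 2.x static graph; the endpoints and the tiny network are placeholders, not taken from any example above.

import paddle
import paddle.distributed.fleet as fleet
import paddle.distributed.fleet.base.role_maker as role_maker

paddle.enable_static()

# Placeholder single-worker role; real runs pass one role per process.
role = role_maker.UserDefinedRoleMaker(
    current_id=0,
    role=role_maker.Role.WORKER,
    worker_num=1,
    server_endpoints=["127.0.0.1:6001"])
fleet.init(role)

strategy = fleet.DistributedStrategy()
strategy.a_sync = True

# Minimal stand-in network.
x = paddle.static.data(name="x", shape=[None, 1], dtype="float32")
loss = paddle.mean(paddle.static.nn.fc(x, size=1))

optimizer = paddle.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy)
optimizer.minimize(loss)

if fleet.is_server():
    fleet.init_server()
    fleet.run_server()
else:
    fleet.init_worker()
    # ...training loop as in Example #18...
    fleet.stop_worker()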