Example #1
def jsonToObject():
    # Load a previously saved topology manager by its id.
    topology_manager = load_topology_manager(
        "f1404379d87f34bda07aba3c530bd146")
    # Create this process's default node and add it as a client.
    GflNode.init_node()
    node = GflNode.default_node
    topology_manager.add_client(client_node=node, add_into_topology=True)
    print(topology_manager.get_index_by_node(node))
    topology_manager.generate_topology()
    # Round-trip: save under a new id, load it back, and inspect it.
    save_topology_manager("123", topology_manager)
    temp_topology_manager = load_topology_manager("123")
    print(temp_topology_manager.topology)
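
The function above is only defined, never called; a minimal entry-point sketch (assuming the gfl imports used inside it are already in scope):

# Hypothetical invocation of the example above.
if __name__ == "__main__":
    jsonToObject()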
Example #2
    def setUp(self) -> None:
        # There are two training nodes and one job; the job needs two
        # training rounds, so we need 1 aggregator_scheduler and 2
        # jobTrainerSchedulers.
        # Put these 3 schedulers into a queue.
        # While the queue is non-empty, take the first scheduler and check
        # whether it can start executing.
        # If it can, execute it; afterwards check whether the job's goal is
        # met (has training reached the specified number of rounds?).
        # If the goal is not met, put it back at the tail of the queue.
        # If it cannot execute, put it at the tail of the queue.
        # (A sketch of this driver loop follows this setUp.)
        self.dataset = generate_dataset()
        print("generated dataset_id:" + self.dataset.dataset_id)
        self.job = generate_job()
        print("generated job_id:" + self.job.job_id)
        self.job.mount_dataset(self.dataset)

        # job_2 and job_3 reuse job's id: all three objects describe the
        # same job, one per scheduler.
        self.job_2 = generate_job()
        self.job_2.job_id = self.job.job_id
        self.job_2.mount_dataset(self.dataset)
        print("generated job_2_id:" + self.job_2.job_id)

        self.job_3 = generate_job()
        self.job_3.job_id = self.job.job_id
        self.job_3.mount_dataset(self.dataset)
        print("generated job_3_id:" + self.job_3.job_id)

        GflNode.init_node()
        node1 = GflNode.default_node
        self.aggregator_scheduler = JobAggregateScheduler(node=None, job=self.job)

        self.jobTrainerScheduler_1 = JobTrainScheduler(node=node1, job=self.job_2)
        JobManager.init_job_sqlite(self.job_2.job_id)
        client1 = ClientEntity(self.jobTrainerScheduler_1.node.address,
                               self.jobTrainerScheduler_1.job.dataset.dataset_id,
                               self.jobTrainerScheduler_1.node.pub_key)
        save_client(self.job_2.job_id, client=client1)
        self.jobTrainerScheduler_1.register()

        GflNode.init_node()
        node2 = GflNode.default_node
        self.jobTrainerScheduler_2 = JobTrainScheduler(node=node2, job=self.job_3)
        client2 = ClientEntity(self.jobTrainerScheduler_2.node.address,
                               self.jobTrainerScheduler_2.job.dataset.dataset_id,
                               self.jobTrainerScheduler_2.node.pub_key)
        save_client(self.job_3.job_id, client=client2)
        self.jobTrainerScheduler_2.register()

        # Put the schedulers into the queue (the driver loop is sketched below).
        self.list = []
        self.list.append(self.aggregator_scheduler)
        self.list.append(self.jobTrainerScheduler_1)
        self.list.append(self.jobTrainerScheduler_2)
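
The comments at the top of this setUp describe a round-robin driver loop over self.list that the snippet itself does not run; a minimal sketch of that loop, mirroring the one in Example #8 (the scheduler methods is_finished, is_available and start are taken from there):

        # Driver loop for the schedulers queued above, as in Example #8:
        # finished schedulers are dropped, available ones run, and the rest
        # stay queued for the next pass.
        while len(self.list) != 0:
            for num in range(len(self.list) - 1, -1, -1):
                scheduler = self.list[num]
                if scheduler.is_finished():
                    self.list.remove(scheduler)
                elif scheduler.is_available():
                    scheduler.start()
                    if scheduler.is_finished():
                        self.list.remove(scheduler)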
Example #3
    def init(cls, force):
        if os.path.exists(GflConf.home_dir):
            if force:
                logging.shutdown()
                shutil.rmtree(GflConf.home_dir)
            else:
                raise ValueError("home dir already exists; pass force=True to recreate it.")
        # create home dir
        os.makedirs(GflConf.home_dir)

        # generate config file
        GflConf.generate_config(PathUtils.join(GflConf.home_dir,
                                               "config.yaml"))
        # generate node address and key
        GflNode.init_node()
        # create data directories
        Lfs.init()
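
A hedged usage sketch for init; the enclosing class is not shown in this snippet, so Bootstrap below is a placeholder name:

# Hypothetical call site; "Bootstrap" stands in for the class that owns init().
# force=True removes and recreates GflConf.home_dir, as implemented above.
Bootstrap.init(force=True)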
Example #4
    def setUp(self) -> None:
        self.dataset = generate_dataset()
        print("dataset_id:" + self.dataset.dataset_id)
        self.job = generate_job()
        print("job_id:" + self.job.job_id)
        self.job.mount_dataset(self.dataset)
        GflNode.init_node()
        node = GflNode.default_node
        # each scheduler gets its own deep copy of the job
        self.aggregator_scheduler = JobAggregateScheduler(node=None,
                                                          job=copy.deepcopy(self.job),
                                                          target_num=1)
        self.jobTrainerScheduler = JobTrainScheduler(node=node,
                                                     job=copy.deepcopy(self.job))
        JobManager.init_job_sqlite(self.job.job_id)
        client = ClientEntity(self.jobTrainerScheduler.node.address,
                              self.jobTrainerScheduler.job.dataset.dataset_id,
                              self.jobTrainerScheduler.node.pub_key)
        save_client(self.job.job_id, client=client)
Example #5
    def run(cls, role, console, **kwargs):
        # Silence raw stderr; daemon output goes to the log files below.
        sys.stderr = open(os.devnull, "w")
        cls.logger = logging.getLogger("gfl")
        with Daemonizer() as (is_setup, daemonizer):
            main_pid = None
            if is_setup:
                # First pass: still in the original foreground process.
                main_pid = os.getpid()
            pid_file = PathUtils.join(GflConf.home_dir, "proc.lock")
            stdout_file = PathUtils.join(GflConf.logs_dir, "console_out")
            stderr_file = PathUtils.join(GflConf.logs_dir, "console_err")
            is_parent = daemonizer(pid_file,
                                   stdout_goto=stdout_file,
                                   stderr_goto=stderr_file)
            if is_parent:
                # Only the original foreground process starts the shell.
                if console and main_pid == os.getpid():
                    Shell.startup()

        GflNode.load_node()

        if GflConf.get_property("net.mode") == "standalone":
            # make sure client_number standalone nodes exist
            client_number = GflConf.get_property(
                "net.standalone.client_number")
            for _ in range(len(GflNode.standalone_nodes), client_number):
                GflNode.add_standalone_node()

            ManagerHolder.default_manager = NodeManager(
                node=GflNode.default_node, role="server")

            for i in range(client_number):
                client_manager = NodeManager(node=GflNode.standalone_nodes[i],
                                             role="client")
                ManagerHolder.standalone_managers.append(client_manager)
        else:
            ManagerHolder.default_manager = NodeManager(
                node=GflNode.default_node, role=role)

        # cls.__startup_node_managers()
        HttpListener.start()

        while HttpListener.is_alive():
            time.sleep(2)
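
A hedged launch sketch; the class that owns run() is not shown in this snippet, so Daemon below is a placeholder name and the argument values are illustrative:

# Hypothetical call site; "Daemon" stands in for the class that owns run().
Daemon.run(role="client", console=True)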
Example #6
    def setUp(self) -> None:
        self.dataset = generate_dataset()
        print("dataset_id:" + self.dataset.dataset_id)
        self.job = generate_job()
        print("job_id:" + self.job.job_id)
        self.job.mount_dataset(self.dataset)
        GflNode.init_node()
        node = GflNode.default_node
        self.jobTrainerScheduler = JobTrainScheduler(node=node, job=self.job)
        self.jobTrainerScheduler.register()

        # the aggregator needs to initialize a random global model
        global_params_dir = JobPath(self.job.job_id).global_params_dir(
            self.job.cur_round)
        # print("global_params_dir:"+global_params_dir)
        os.makedirs(global_params_dir, exist_ok=True)
        model_params_path = PathUtils.join(global_params_dir,
                                           self.job.job_id + '.pth')
        # print("model_params_path:"+model_params_path)
        model = Net()
        torch.save(model.state_dict(), model_params_path)
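
Since later rounds read these initial global parameters back, a minimal round-trip check (torch.load and load_state_dict are standard PyTorch calls; Net is the model class used above):

        # Sanity check: the saved initial parameters load into a fresh Net.
        model2 = Net()
        model2.load_state_dict(torch.load(model_params_path))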
Example #7
    def setUp(self) -> None:
        # GflNode.init_node()
        GflNode.load_node()
        node_server = GflNode.standalone_nodes[0]
        node_client1 = GflNode.standalone_nodes[1]
        node_client2 = GflNode.standalone_nodes[2]
        self.node_manager_server = NodeManager(node=node_server, role="server")
        self.node_manager_client1 = NodeManager(node=node_client1, role="client")
        self.node_manager_client2 = NodeManager(node=node_client2, role="client")
        # create the job
        self.job = generate_job()

        # create the topology for the job
        topology_config = TopologyConfig()
        topology_config.with_train_node_num(2)
        topology_config.with_server_nodes([node_server.address])
        topology_config.with_client_nodes([node_client1.address, node_client2.address])
        topology_config.with_index2node([node_server.address, node_client1.address, node_client2.address])
        temp_topology_manager = CentralizedTopologyManager(topology_config)
        temp_topology_manager.generate_topology()

        # save the topology for the job
        save_topology_manager(job_id=self.job.job_id, topology_manager=temp_topology_manager)
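
A round-trip check using load_topology_manager from Example #1, assuming save and load share the job_id key as the calls above suggest:

        # Load the topology manager saved above and inspect its topology.
        loaded_topology_manager = load_topology_manager(self.job.job_id)
        print(loaded_topology_manager.topology)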
Example #8
    def test_start(self):
        # Two training nodes, one job; the job needs two training rounds,
        # so we need 1 aggregator_scheduler and 2 jobTrainerSchedulers.
        # generate the dataset
        self.dataset = generate_dataset()
        print("generated dataset_id:" + self.dataset.dataset_id)

        # generate 3 identical jobs
        self.job = generate_job()
        print("generated job_id:" + self.job.job_id)
        self.job.mount_dataset(self.dataset)
        JobManager.init_job_sqlite(self.job.job_id)
        JobManager.submit_job(self.job)
        self.job_2 = generate_job()
        self.job_2.job_id = self.job.job_id
        self.job_2.mount_dataset(self.dataset)
        print("生成的job_2_id:" + self.job_2.job_id)
        self.job_3 = generate_job()
        self.job_3.job_id = self.job.job_id
        self.job_3.mount_dataset(self.dataset)
        print("生成的job_3_id:" + self.job_3.job_id)
        # 生成3个node。1个是聚合方,2个是训练方
        # 聚合方
        GflNode.init_node()
        node1 = GflNode.default_node
        # bind the jobs to the aggregator node
        self.job.add_server(node1)
        self.job_2.add_server(node1)
        self.job_3.add_server(node1)
        # topology, generated from the job
        self.tpmgr = CentralizedTopologyManager(n=3, job=self.job, aggregate_node=node1)
        self.aggregator_scheduler = JobAggregateScheduler(node=node1, topology_manager=self.tpmgr, job=self.job)
        # first trainer
        GflNode.init_node()
        node2 = GflNode.default_node
        self.jobTrainerScheduler_1 = JobTrainScheduler(node=node2, topology_manager=self.tpmgr, job=self.job_2)
        # JobManager.init_job_sqlite(self.job_2.job_id)
        client1 = ClientEntity(self.jobTrainerScheduler_1.node.address,
                               self.jobTrainerScheduler_1.job.dataset.dataset_id,
                               self.jobTrainerScheduler_1.node.pub_key)
        save_client(self.job_2.job_id, client=client1)
        self.jobTrainerScheduler_1.register()
        # add it to the topology
        self.tpmgr.add_node_into_topology(node2, 1)
        # second trainer
        GflNode.init_node()
        node3 = GflNode.default_node
        self.jobTrainerScheduler_2 = JobTrainScheduler(node=node3, topology_manager=self.tpmgr, job=self.job_3)
        client2 = ClientEntity(self.jobTrainerScheduler_2.node.address,
                               self.jobTrainerScheduler_2.job.dataset.dataset_id,
                               self.jobTrainerScheduler_2.node.pub_key)
        save_client(self.job_3.job_id, client=client2)
        self.jobTrainerScheduler_2.register()
        # add it to the topology
        self.tpmgr.add_node_into_topology(node3, 2)
        # generate the centralized topology
        self.tpmgr.generate_topology()
        # put the schedulers into the queue
        self.list = []
        self.list.append(self.aggregator_scheduler)
        self.list.append(self.jobTrainerScheduler_1)
        self.list.append(self.jobTrainerScheduler_2)
        # Round-robin until every scheduler reports finished: drop finished
        # schedulers, run available ones, and keep the rest queued.
        while len(self.list) != 0:
            for num in range(len(self.list) - 1, -1, -1):
                scheduler = self.list[num]
                if scheduler.is_finished():
                    self.list.remove(scheduler)
                elif scheduler.is_available():
                    scheduler.start()
                    if scheduler.is_finished():
                        self.list.remove(scheduler)
Example #9
import os

# Run from the parent of this script's directory so the relative
# home_dir "data" set below resolves consistently.
work_dir = os.path.dirname(os.path.dirname(__file__))
os.chdir(work_dir)

from gfl.conf import GflConf
from gfl.core.manager.node import GflNode
GflConf.home_dir = "data"
GflNode.load_node()
Example #10
def generate_nodes():
    # Create the default node, then three standalone nodes (used in
    # standalone mode, see Examples #5 and #7).
    GflNode.init_node()
    GflNode.add_standalone_node()
    GflNode.add_standalone_node()
    GflNode.add_standalone_node()
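
A hedged sketch of consuming these nodes afterwards, mirroring Example #7's setUp; whether init_node and add_standalone_node persist state for a later load_node is an assumption drawn from that example:

# Hypothetical follow-up, as in Example #7: reload and pick out the nodes.
GflNode.load_node()
node_server = GflNode.standalone_nodes[0]
node_client1 = GflNode.standalone_nodes[1]
node_client2 = GflNode.standalone_nodes[2]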