Example #1
def send_mail(message: MIMEText):
    """
    Send ``message`` to the email address specified in the configuration.

    :param message: the message to be sent.
    :return: True if the mail was sent or SMTP is not fully configured, otherwise False.
    """
    try:
        server_host = Config.get_property("smtp.server-host")
        server_port = Config.get_property("smtp.server-port")
        user = Config.get_property("smtp.user")
        password = Config.get_property("smtp.password")
        receiver = Config.get_property("smtp.receiver")

        if None in [server_host, server_port, user, password, receiver] or \
                "" in [server_host, server_port, user, password, receiver]:
            return True

        send_from = __format_addr("noreply", user)
        send_to = __format_addr(getpass.getuser(), receiver)

        message["From"] = send_from
        message["To"] = send_to

        smtp = smtplib.SMTP(server_host, server_port)
        smtp.login(user, password)
        smtp.sendmail(user, [
            receiver,
        ], message.as_string())
        smtp.quit()
        return True
    except Exception as e:
        print(e)
        return False
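
A usage sketch mirroring how Example #9 below builds the message; the subject text here is arbitrary:

    from email.header import Header
    from email.mime.text import MIMEText

    message = MIMEText("<p>hello</p>", "html", "utf-8")
    message["Subject"] = Header("Fedflow test report", "utf-8")
    if send_mail(message):
        print("mail sent (or SMTP not configured)")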
Example #2
    def memory_free(cls, require_memory: Union[int, str] = None) -> bool:
        """
        Check memory utilization.

        :param require_memory: the amount of memory the current task requires. ``require_memory`` may be an int
            (in bytes) or a str (a number plus a unit, e.g. '123KB', '456 MB', '789MiB').
        :return: True if there is enough free memory for the task, otherwise False.
        """
        if require_memory is None:
            require_memory = Config.get_property("scheduler.default-memory")
        require_memory = cls.parse_memory_value(require_memory)

        mem = psutil.virtual_memory()
        total = mem.total
        available = mem.available
        cls.logger.debug("memory utilization: %.2f%%{available: %.3fGiB, total: %.3fGiB}",
                         100 * (total - available) / total,
                         ByteUnits.convert(ByteUnits.iB, ByteUnits.GiB, available),
                         ByteUnits.convert(ByteUnits.iB, ByteUnits.GiB, total))
        available = mem.available - require_memory

        utilization_limit = Config.get_property("utilization-limit.memory")
        if available < 0 or available / total < 1 - utilization_limit:
            return False

        remain_limit = Config.get_property("remain-limit.memory")
        remain_limit = cls.parse_memory_value(remain_limit)
        if available < remain_limit:
            return False

        return True
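
A minimal usage sketch; attaching these classmethods to GroupScheduler is an inference from how Example #15 calls them:

    if GroupScheduler.memory_free("512MiB"):       # str form: number plus unit
        print("enough free memory for a 512 MiB task")
    if GroupScheduler.memory_free(2 * 1024 ** 3):  # int form: plain bytes
        print("enough free memory for a 2 GiB task")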
Example #3
    def __interrupt(self, task: Task, interrupt_from: str) -> None:
        """
        When there is insufficient memory(or cuda memory) during the scheduling process, the task will be interrupt and
        the status of task is set to ``TaskStatus.INTERRUPT``.

        If interrupt occurs in ``load`` stage, the task process will be killed, and the status is set to
        ``TaskStatus.AVAILABLE``. Then, the task is added to available task queue, and waiting for the next schedule.

        If interrupt occurs in ``train`` stage, the task process will be reserved, and the status is set to
        ``TaskStatus.WAITING``. Then, the task is added to waiting task queue, and waiting for the next scheduler.

        The exception is if the maximum number of retries(``load`` and ``train`` stage are count separately) is reached,
        the status of task is set to ``TaskStatus.EXCEPTION``, and the task process will be killed.

        :param task: the task which interrupted.
        :param interrupt_from: the stage occurs OOM(or cuda OOM), its value only can be 'load' or 'train'
        :return:
        """
        if interrupt_from == "LOAD":
            if task.load_numbers < Config.get_property("scheduler.load-nretry"):
                task.exit()
                self.group.move_task(task.task_id, task.status, TaskStatus.AVAILABLE)
            else:
                task.exit()
                self.group.report_exception(task.task_id, "load", "LoadNumbersExceed")
                self.group.move_task(task.task_id, task.status, TaskStatus.EXCEPTION)
        else:
            if task.train_numbers < Config.get_property("scheduler.train-nretry"):
                self.group.move_task(task.task_id, task.status, TaskStatus.WAITING)
            else:
                task.exit()
                self.group.report_exception(task.task_id, "train", "TrainNumbersExceed")
                self.group.move_task(task.task_id, task.status, TaskStatus.EXCEPTION)
Example #4
    def open(self):
        # enter the working state and switch to the configured working directory
        self.in_working = True
        workdir = Config.get_property("workdir")
        os.makedirs(workdir, exist_ok=True)
        # remember the previous working directory so it can be restored later
        self.__pre_workdir = os.path.abspath(os.curdir)
        os.chdir(workdir)

        MessageListener.start()
Example #5
    def assign_cuda(cls, require_cuda_memory=None, device: str = None):
        """
        assign a cuda device.

        :param require_cuda_memory: the cuda memory current task required.
        :param device: specify a device, then other device will be ignored.
        :return: An integer represents the cuda id
        """
        if require_cuda_memory is None:
            require_cuda_memory = Config.get_property("scheduler.default-cuda-memory")
        require_cuda_memory = cls.parse_memory_value(require_cuda_memory)

        gpus = NGPUInfo.list_gpus()
        if device is not None:
            try:
                device = device.replace("cuda:", "")
                device_id = int(device)
                gpus = [gpus[device_id], ]
            except (ValueError, IndexError):
                # fall back to scanning all GPUs if the device string is invalid
                pass

        for g in gpus:
            gpu: ngpuinfo.NGPU = g
            total = gpu.mem_total()
            available = gpu.mem_free()
            cls.logger.debug("cuda:%d memory utilization: %.2f%%{available: %.3fGiB, total: %.3fGiB}",
                             gpu.id, 100 * (total - available) / total,
                             ByteUnits.convert(ByteUnits.iB, ByteUnits.GiB, available),
                             ByteUnits.convert(ByteUnits.iB, ByteUnits.GiB, total))
            available = gpu.mem_free() - require_cuda_memory

            utilization_limit = Config.get_property("utilization-limit.cuda-memory")
            if available < 0 or available / total < 1 - utilization_limit:
                continue

            remain_limit = Config.get_property("remain-limit.cuda-memory")
            remain_limit = cls.parse_memory_value(remain_limit)
            if available < remain_limit:
                continue

            cls.logger.debug("select cuda:%d", gpu.id)
            return gpu.id

        cls.logger.debug("no free gpu.")
        return -1
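
A small usage sketch mirroring how Example #15 consumes the return value; GroupScheduler as the owning class is inferred from that example:

    device_id = GroupScheduler.assign_cuda("2GiB")
    if device_id >= 0:
        device = "cuda:%d" % device_id  # a torch-style device string, e.g. "cuda:0"
    else:
        print("no GPU currently has enough free memory")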
Example #6
    def execute(self, group: TaskGroup) -> None:
        if not self.in_working:
            raise ValueError("Please use 'with FedFlow()'")
        if Config.get_property("task.directory-grouping"):
            os.makedirs(group.group_name, exist_ok=True)
            with WorkDirContext(group.group_name):
                group.workdir = os.path.abspath(".")
                GroupScheduler.schedule(group)
        else:
            GroupScheduler.schedule(group)
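
A sketch of the intended call pattern, inferred from the ``in_working`` check above and from ``open()`` in Example #4; it assumes FedFlow's context manager calls ``open()`` on entry and returns the instance:

    # 'group' is a previously constructed TaskGroup (see Examples #11 and #12)
    with FedFlow() as flow:
        flow.execute(group)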
Example #7
def detect_logging():
    os.makedirs("logs", exist_ok=True)
    conf_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                             "resources", "logging.yaml")
    with open(conf_path, "r") as f:
        d = yaml.load(f, yaml.SafeLoader)
    if Config.get_property("debug"):
        set_debug(d)
        d["root"]["level"] = "INFO"
    logging.config.dictConfig(d)
Example #8
    def cpu_free(cls) -> bool:
        """
        check cpu utilization.

        :return: a bool value.
        """
        cpu_precent = psutil.cpu_percent()
        utilization_limit = Config.get_property("utilization-limit.cpu")
        cls.logger.debug("CPU utilization: %.2f%%", cpu_precent)
        return cpu_precent < 100 * utilization_limit
Example #9
    def send_group_result(cls, name: str, result: dict) -> None:
        """
        send report of a group.

        :param name: group name
        :param result: the result need to be reported.
        :return:
        """
        html = group_template(name, result)
        message = MIMEText(html, "html", "utf-8")
        message["Subject"] = Header("Fedflow %s report" % name, "utf-8")
        if Config.get_property("smtp.enable"):
            if send_mail(message):
                cls.logger.info("send group report mail.")
            else:
                cls.logger.error("send group report mail failed.")

        reports_dir = os.path.join(Config.get_property("workdir"), "reports")
        os.makedirs(reports_dir, exist_ok=True)
        filename = os.path.join(reports_dir, "%s.html" % name)
        with open(filename, "wb") as f:
            f.write(html.encode("utf-8"))
Example #10
    def start(cls) -> None:
        """
        Start schedule tasks

        :return:
        """
        workdir = Config.get_property("workdir")
        workdir = os.path.abspath(workdir)
        os.makedirs(workdir, exist_ok=True)
        os.chdir(workdir)

        MessageListener.start()

        for g in cls.groups:
            if Config.get_property("task.directory-grouping"):
                os.makedirs(g.group_name, exist_ok=True)
                with WorkDirContext(g.group_name):
                    g.workdir = os.path.abspath(".")
                    GroupScheduler.schedule(g)
            else:
                GroupScheduler.schedule(g)

        MessageListener.stop()
Example #11
    def add_task(self, task: Task) -> None:
        """
        Add a task to this group.

        :param task: the task to be added to this group
        :return:
        """
        if task.device is None:
            task.device = self.device

        if not Config.get_property("task.allow-duplicate-id") \
                and task.task_id in TaskGroup.global_ids:
            raise ValueError("Duplicate id[%s] in global." % str(task.task_id))
        TaskGroup.global_ids.add(task.task_id)
        if task.task_id in self.task_ids:
            raise ValueError("Duplicate id[%s] in group." % str(task.task_id))
        self.task_ids.add(task.task_id)

        self.tasks[task.status][task.task_id] = task
        self.task_number += 1
Example #12
    def __init__(self,
                 group_name: str = None,
                 *,
                 estimate_memory: Union[int, str] = None,
                 estimate_cuda_memory: Union[int, str] = None,
                 device=None):
        """
        Construct a task group.

        :param group_name: the group name; it is only used to create the group directory and for display in the report.
        :param estimate_memory: the maximum memory expected to be used by each task in this group.
        :param estimate_cuda_memory: the maximum CUDA memory expected to be used by each task in this group.
        :param device: the device used by the tasks in this group; if None, the device is decided by the
            scheduler.
        """
        super(TaskGroup, self).__init__()
        self.index = -1
        self.__group_name = group_name
        self.estimate_memory = estimate_memory
        self.estimate_cuda_memory = estimate_cuda_memory
        self.__device = device
        self.auto_adjust_memory = self.estimate_memory is None
        self.auto_adjust_cuda_memory = self.estimate_cuda_memory is None
        if not Config.get_property("scheduler.auto-adjust"):
            self.auto_adjust_memory = False
            self.auto_adjust_cuda_memory = False

        self.task_ids = set()
        self.tasks = {}
        for ts in TaskStatus.__members__.values():
            self.tasks[ts] = {}

        self.task_number = 0
        self.success_number = 0
        self.failed_number = 0

        self.result = {}

        self.workdir = None
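
A small construction sketch based on the parameters above and ``add_task`` from Example #11; the group name, memory estimates, and task object are hypothetical:

    group = TaskGroup("mnist-group",
                      estimate_memory="1GiB",       # expected per-task host memory
                      estimate_cuda_memory="2GiB")  # expected per-task CUDA memory
    group.add_task(task)                            # 'task' is a previously constructed Task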
Example #13
    def test_load(self):
        self.assertEqual(Config.get_property("workdir"), ".")
        Config.load("config.yaml")
        self.assertEqual(Config.get_property("workdir"), "res")
        self.assertEqual(Config.get_property("utilization-limit.cpu"), 0.9)
Example #14
    def test_modify(self):
        Config.set_property("utilization-limit.cpu", 0.5)
        self.assertEqual(Config.get_property("utilization-limit.cpu"), 0.5)
Example #15
    def schedule(cls, group: TaskGroup) -> None:
        """
        The entry of schedule.

        This method is blocked.

        :param group: the task group waiting for scheduling.
        :return:
        """
        cls.logger.info("schedule group #%s", group.index)
        MessageListener.register_default_handler(TaskHandler(group))

        schedule_round = 1
        while not group.finished():
            process_number, waiting_number, training_number = group.numbers()
            cls.logger.info("schedule round #%d{waiting: %d, training: %d, process: %d}",
                            schedule_round, waiting_number, training_number, process_number)
            schedule_round += 1

            max_process = Config.get_property("scheduler.max-process")
            if process_number < max_process or max_process == 0:
                if cls.cpu_free():
                    # schedule load
                    max_waiting = Config.get_property("scheduler.max-waiting")
                    if waiting_number < max_waiting or max_waiting == 0:
                        # start init task
                        task: Task = group.retrieve_task(TaskStatus.INIT)
                        if task is not None:
                            cls.logger.info("task{%s} start", task.task_id)
                            task.start()
                            time.sleep(3)
                        else:
                            cls.logger.debug("no init task exists.")

                        # start available task
                        task: Task = group.retrieve_task(TaskStatus.AVAILABLE)
                        if task is not None:
                            require_memory = task.estimate_memory
                            if require_memory is None:
                                require_memory = group.estimate_memory
                            if cls.memory_free(require_memory):
                                cls.logger.info("task{%s} start load", task.task_id)
                                task.start_load()
                            else:
                                cls.logger.warning("memory utilization is too high.")
                        else:
                            cls.logger.debug("no available task exists.")
                    else:
                        cls.logger.info("the maximum number of waiting has been reached.")

                    # schedule train
                    task: Task = group.retrieve_task(TaskStatus.WAITING)
                    if task is not None:
                        require_cuda_memory = task.estimate_cuda_memory
                        if require_cuda_memory is None:
                            require_cuda_memory = group.estimate_cuda_memory
                        device_id = cls.assign_cuda(require_cuda_memory, task.device)
                        if device_id >= 0:
                            device = "cuda:%d" % device_id
                            cls.logger.info("task{%s} start train in %s", task.task_id, device)
                            task.start_train(device)
                        else:
                            cls.logger.warning("GPU utilization is too high.")
                    else:
                        cls.logger.info("no waiting task exists.")

                else:
                    cls.logger.warning("CPU utilization is too high.")

            else:
                cls.logger.info("the maximum number of processes has been reached.")

            cls.logger.info("sleeping...")
            time.sleep(Config.get_property("scheduler.interval"))

        # send task group report
        Mail.send_group_result(group.group_name, group.result)
Example #16
import sys

from fedflow.config import Config

if __name__ == "__main__":
    if len(sys.argv) > 1:
        if sys.argv[1] == "generate-config":
            if len(sys.argv) > 2:
                Config.generate_config(sys.argv[2])
            else:
                Config.generate_config(None)
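
The same effect can be achieved directly from Python; a minimal sketch using the ``Config.generate_config`` call shown above (the target filename is arbitrary):

    from fedflow.config import Config

    Config.generate_config("config.yaml")  # write a configuration template to the given path (purpose inferred from the command name)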
Example #17
import fedflow_test

import os

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision.datasets import mnist
from torchvision.transforms import transforms

from fedflow import Task, TaskGroup, FedFlow
from fedflow.config import Config
from fedflow.utils.trainer.supervised_trainer import SupervisedTrainer

Config.set_property("debug", True)
Config.set_property("scheduler.interval", 2)

datasets_path = os.path.join(os.path.abspath("."), "datasets")


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 20, 5, 1)
        self.conv2 = nn.Conv2d(20, 50, 5, 1)
        self.fc1 = nn.Linear(4 * 4 * 50, 500)
        self.fc2 = nn.Linear(500, 10)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = F.relu(self.conv1(x))
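        # NOTE: the example listing is truncated here. A plausible completion, consistent with the layer
        # definitions in __init__ (the max-pooling steps are assumptions, not confirmed by the snippet):
        x = F.max_pool2d(x, 2, 2)
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2, 2)
        x = x.view(-1, 4 * 4 * 50)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return self.softmax(x)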
Example #18
def detect_config():
    if os.path.exists("config.yaml"):
        Config.load("config.yaml")