def send_mail(message: MIMEText):
    """
    Send ``message`` to the specified email address.

    :param message: the message to be sent.
    :return: ``True`` if the mail was sent (or skipped because the SMTP configuration is
             incomplete), otherwise ``None``.
    """
    try:
        server_host = Config.get_property("smtp.server-host")
        server_port = Config.get_property("smtp.server-port")
        user = Config.get_property("smtp.user")
        password = Config.get_property("smtp.password")
        receiver = Config.get_property("smtp.receiver")
        if None in [server_host, server_port, user, password, receiver] or \
                "" in [server_host, server_port, user, password, receiver]:
            return True
        send_from = __format_addr("noreply", user)
        send_to = __format_addr(getpass.getuser(), receiver)
        message["From"] = send_from
        message["To"] = send_to
        smtp = smtplib.SMTP(server_host, server_port)
        smtp.login(user, password)
        smtp.sendmail(user, [receiver, ], message.as_string())
        smtp.quit()
        return True
    except Exception as e:
        print(e)
def memory_free(cls, require_memory: Union[int, str] = None) -> bool:
    """
    Check memory utilization.

    :param require_memory: the memory the current task requires. The type of ``require_memory``
                           can be int (the unit is byte) or str (number + unit, for example,
                           '123KB', '456 MB', '789MiB').
    :return: a bool value
    """
    if require_memory is None:
        require_memory = Config.get_property("scheduler.default-memory")
    require_memory = cls.parse_memory_value(require_memory)
    mem = psutil.virtual_memory()
    total = mem.total
    available = mem.available
    cls.logger.debug("memory utilization: %.2f%%{available: %.3fGiB, total: %.3fGiB}",
                     100 * (total - available) / total,
                     ByteUnits.convert(ByteUnits.iB, ByteUnits.GiB, available),
                     ByteUnits.convert(ByteUnits.iB, ByteUnits.GiB, total))
    available = mem.available - require_memory
    utilization_limit = Config.get_property("utilization-limit.memory")
    if available < 0 or available / total < 1 - utilization_limit:
        return False
    remain_limit = Config.get_property("remain-limit.memory")
    remain_limit = cls.parse_memory_value(remain_limit)
    if available < remain_limit:
        return False
    return True
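# A minimal usage sketch, not part of the original source. It assumes that memory_free and
# parse_memory_value are classmethods of GroupScheduler (as the calls in GroupScheduler.schedule()
# further below suggest) and that the class can be imported from fedflow.core.scheduler; the
# import path is a guess.
from fedflow.core.scheduler import GroupScheduler  # hypothetical import path

# require_memory accepts an int (bytes) or a string such as '123KB', '456 MB' or '789MiB'.
if GroupScheduler.memory_free("512MiB"):
    print("enough free host memory to start loading a task")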
def __interrupt(self, task: Task, interrupt_from: str) -> None:
    """
    When there is insufficient memory (or CUDA memory) during the scheduling process, the task is
    interrupted and its status is set to ``TaskStatus.INTERRUPT``.

    If the interrupt occurs in the ``load`` stage, the task process is killed and the status is
    set to ``TaskStatus.AVAILABLE``. The task is then added to the available task queue to wait
    for the next schedule.

    If the interrupt occurs in the ``train`` stage, the task process is kept alive and the status
    is set to ``TaskStatus.WAITING``. The task is then added to the waiting task queue to wait
    for the next schedule.

    The exception is that if the maximum number of retries (the ``load`` and ``train`` stages are
    counted separately) is reached, the status of the task is set to ``TaskStatus.EXCEPTION`` and
    the task process is killed.

    :param task: the task that was interrupted.
    :param interrupt_from: the stage in which OOM (or CUDA OOM) occurred; its value can only be
                           'LOAD' or 'TRAIN'.
    :return:
    """
    if interrupt_from == "LOAD":
        if task.load_numbers < Config.get_property("scheduler.load-nretry"):
            task.exit()
            self.group.move_task(task.task_id, task.status, TaskStatus.AVAILABLE)
        else:
            task.exit()
            self.group.report_exception(task.task_id, "load", "LoadNumbersExceed")
            self.group.move_task(task.task_id, task.status, TaskStatus.EXCEPTION)
    else:
        if task.train_numbers < Config.get_property("scheduler.train-nretry"):
            self.group.move_task(task.task_id, task.status, TaskStatus.WAITING)
        else:
            task.exit()
            self.group.report_exception(task.task_id, "train", "TrainNumbersExceed")
            self.group.move_task(task.task_id, task.status, TaskStatus.EXCEPTION)
def open(self):
    self.in_working = True
    workdir = Config.get_property("workdir")
    os.makedirs(workdir, exist_ok=True)
    self.__pre_workdir = os.path.abspath(os.curdir)
    os.chdir(workdir)
    MessageListener.start()
def assign_cuda(cls, require_cuda_memory=None, device: str = None):
    """
    Assign a CUDA device.

    :param require_cuda_memory: the CUDA memory the current task requires.
    :param device: specify a device; all other devices will then be ignored.
    :return: an integer representing the CUDA device id, or -1 if no suitable device is found.
    """
    if require_cuda_memory is None:
        require_cuda_memory = Config.get_property("scheduler.default-cuda-memory")
    require_cuda_memory = cls.parse_memory_value(require_cuda_memory)
    gpus = NGPUInfo.list_gpus()
    if device is not None:
        try:
            device = device.replace("cuda:", "")
            device_id = int(device)
            gpus = [gpus[device_id], ]
        except (ValueError, IndexError):
            # fall back to scanning all GPUs if the specified device is invalid
            pass
    for g in gpus:
        gpu: ngpuinfo.NGPU = g
        total = gpu.mem_total()
        available = gpu.mem_free()
        cls.logger.debug("cuda:%d memory utilization: %.2f%%{available: %.3fGiB, total: %.3fGiB}",
                         gpu.id,
                         100 * (total - available) / total,
                         ByteUnits.convert(ByteUnits.iB, ByteUnits.GiB, available),
                         ByteUnits.convert(ByteUnits.iB, ByteUnits.GiB, total))
        available = gpu.mem_free() - require_cuda_memory
        utilization_limit = Config.get_property("utilization-limit.cuda-memory")
        if available < 0 or available / total < 1 - utilization_limit:
            continue
        remain_limit = Config.get_property("remain-limit.cuda-memory")
        remain_limit = cls.parse_memory_value(remain_limit)
        if available < remain_limit:
            continue
        cls.logger.debug("select cuda:%d", gpu.id)
        return gpu.id
    cls.logger.debug("no free gpu.")
    return -1
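# A hedged sketch, not part of the original source, showing how assign_cuda's return value is
# typically turned into a device string (mirroring GroupScheduler.schedule() further below).
# As above, GroupScheduler and its import path are assumptions.
from fedflow.core.scheduler import GroupScheduler  # hypothetical import path

device_id = GroupScheduler.assign_cuda("2GiB")              # optionally: assign_cuda("2GiB", device="cuda:0")
device = "cuda:%d" % device_id if device_id >= 0 else None  # -1 means no GPU has enough free memory
print(device or "no suitable GPU found")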
def execute(self, group: TaskGroup) -> None:
    if not self.in_working:
        raise ValueError("Please use 'with FedFlow()'")
    if Config.get_property("task.directory-grouping"):
        os.makedirs(group.group_name, exist_ok=True)
        with WorkDirContext(group.group_name):
            group.workdir = os.path.abspath(".")
            GroupScheduler.schedule(group)
    else:
        GroupScheduler.schedule(group)
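# A usage sketch implied by the error message above ("Please use 'with FedFlow()'"), not part of
# the original source: FedFlow acts as a context manager, and execute() is called inside the
# with-block on a previously built TaskGroup.
from fedflow import FedFlow, TaskGroup

group = TaskGroup("demo")
# tasks would be added with group.add_task(...) before executing
flow = FedFlow()
with flow:
    flow.execute(group)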
def detect_logging():
    os.makedirs("logs", exist_ok=True)
    conf_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "resources", "logging.yaml")
    with open(conf_path, "r") as f:
        d = yaml.load(f, yaml.SafeLoader)
    if Config.get_property("debug"):
        set_debug(d)
    d["root"]["level"] = "INFO"
    logging.config.dictConfig(d)
def cpu_free(cls) -> bool:
    """
    Check CPU utilization.

    :return: a bool value.
    """
    cpu_percent = psutil.cpu_percent()
    utilization_limit = Config.get_property("utilization-limit.cpu")
    cls.logger.debug("CPU utilization: %.2f%%", cpu_percent)
    return cpu_percent < 100 * utilization_limit
def send_group_result(cls, name: str, result: dict) -> None:
    """
    Send the report of a group.

    :param name: the group name.
    :param result: the result to be reported.
    :return:
    """
    html = group_template(name, result)
    message = MIMEText(html, "html", "utf-8")
    message["Subject"] = Header("Fedflow %s report" % name, "utf-8")
    if Config.get_property("smtp.enable"):
        if send_mail(message):
            cls.logger.info("send group report mail.")
        else:
            cls.logger.error("send group report mail failed.")
    reports_dir = os.path.join(Config.get_property("workdir"), "reports")
    os.makedirs(reports_dir, exist_ok=True)
    filename = os.path.join(reports_dir, "%s.html" % name)
    with open(filename, "wb") as f:
        f.write(html.encode("utf-8"))
def start(cls) -> None:
    """
    Start scheduling tasks.

    :return:
    """
    workdir = Config.get_property("workdir")
    workdir = os.path.abspath(workdir)
    os.makedirs(workdir, exist_ok=True)
    os.chdir(workdir)
    MessageListener.start()
    for g in cls.groups:
        if Config.get_property("task.directory-grouping"):
            os.makedirs(g.group_name, exist_ok=True)
            with WorkDirContext(g.group_name):
                g.workdir = os.path.abspath(".")
                GroupScheduler.schedule(g)
        else:
            GroupScheduler.schedule(g)
    MessageListener.stop()
def add_task(self, task: Task) -> None:
    """
    Add a task to this group.

    :param task: the task to be added to this group.
    :return:
    """
    if task.device is None:
        task.device = self.device
    if not Config.get_property("task.allow-duplicate-id") and task.task_id in TaskGroup.global_ids:
        raise ValueError("Duplicate id[%s] in global." % str(task.task_id))
    TaskGroup.global_ids.add(task.task_id)
    if task.task_id in self.task_ids:
        raise ValueError("Duplicate id[%s] in group." % str(task.task_id))
    self.task_ids.add(task.task_id)
    self.tasks[task.status][task.task_id] = task
    self.task_number += 1
def __init__(self, group_name: str = None, *,
             estimate_memory: Union[int, str] = None,
             estimate_cuda_memory: Union[int, str] = None,
             device=None):
    """
    Construct a task group.

    :param group_name: the group name; it is only used to create the group directory and for
                       display in the report.
    :param estimate_memory: the maximum memory expected to be used by every task in this group.
    :param estimate_cuda_memory: the maximum CUDA memory expected to be used by every task in
                                 this group.
    :param device: specify the device used by the tasks in this group; if it is None, the device
                   will be decided by the scheduler.
    """
    super(TaskGroup, self).__init__()
    self.index = -1
    self.__group_name = group_name
    self.estimate_memory = estimate_memory
    self.estimate_cuda_memory = estimate_cuda_memory
    self.__device = device
    self.auto_adjust_memory = self.estimate_memory is None
    self.auto_adjust_cuda_memory = self.estimate_cuda_memory is None
    if not Config.get_property("scheduler.auto-adjust"):
        self.auto_adjust_memory = False
        self.auto_adjust_cuda_memory = False
    self.task_ids = set()
    self.tasks = {}
    for ts in TaskStatus.__members__.values():
        self.tasks[ts] = {}
    self.task_number = 0
    self.success_number = 0
    self.failed_number = 0
    self.result = {}
    self.workdir = None
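# A minimal construction sketch, not part of the original source. The import matches the MNIST
# example further below; the group name and memory estimates are illustrative values only.
from fedflow import TaskGroup

group = TaskGroup("mnist-demo",
                  estimate_memory="1GiB",        # expected peak host memory per task
                  estimate_cuda_memory="2GiB",   # expected peak CUDA memory per task
                  device=None)                   # None: let the scheduler decide the device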
def test_load(self):
    self.assertEqual(Config.get_property("workdir"), ".")
    Config.load("config.yaml")
    self.assertEqual(Config.get_property("workdir"), "res")
    self.assertEqual(Config.get_property("utilization-limit.cpu"), 0.9)
def test_modify(self):
    Config.set_property("utilization-limit.cpu", 0.5)
    self.assertEqual(Config.get_property("utilization-limit.cpu"), 0.5)
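# A sketch, not part of the original source, of the config.yaml that test_load above expects,
# reconstructed from the asserted properties; the nested YAML layout is an assumption based on
# the dotted property keys (e.g. "utilization-limit.cpu").
with open("config.yaml", "w") as f:
    f.write("workdir: res\n"
            "utilization-limit:\n"
            "  cpu: 0.9\n")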
def schedule(cls, group: TaskGroup) -> None:
    """
    The entry point of scheduling. This method blocks until the group is finished.

    :param group: the task group waiting to be scheduled.
    :return:
    """
    cls.logger.info("schedule group #%s", group.index)
    MessageListener.register_default_handler(TaskHandler(group))
    schedule_round = 1
    while not group.finished():
        process_number, waiting_number, training_number = group.numbers()
        cls.logger.info("schedule round #%d{waiting: %d, training: %d, process: %d}",
                        schedule_round, waiting_number, training_number, process_number)
        schedule_round += 1
        max_process = Config.get_property("scheduler.max-process")
        if process_number < max_process or max_process == 0:
            if cls.cpu_free():
                # schedule load
                max_waiting = Config.get_property("scheduler.max-waiting")
                if waiting_number < max_waiting or max_waiting == 0:
                    # start init task
                    task: Task = group.retrieve_task(TaskStatus.INIT)
                    if task is not None:
                        cls.logger.info("task{%s} start", task.task_id)
                        task.start()
                        time.sleep(3)
                    else:
                        cls.logger.debug("no init task exists.")
                    # start available task
                    task: Task = group.retrieve_task(TaskStatus.AVAILABLE)
                    if task is not None:
                        require_memory = task.estimate_memory
                        if require_memory is None:
                            require_memory = group.estimate_memory
                        if cls.memory_free(require_memory):
                            cls.logger.info("task{%s} start load", task.task_id)
                            task.start_load()
                        else:
                            cls.logger.warning("memory utilization is too high.")
                    else:
                        cls.logger.debug("no available task exists.")
                else:
                    cls.logger.info("the maximum number of waiting tasks has been reached.")
                # schedule train
                task: Task = group.retrieve_task(TaskStatus.WAITING)
                if task is not None:
                    require_cuda_memory = task.estimate_cuda_memory
                    if require_cuda_memory is None:
                        require_cuda_memory = group.estimate_cuda_memory
                    device_id = cls.assign_cuda(require_cuda_memory, task.device)
                    if device_id >= 0:
                        device = "cuda:%d" % device_id
                        cls.logger.info("task{%s} start train in %s", task.task_id, device)
                        task.start_train(device)
                    else:
                        cls.logger.warning("GPU utilization is too high.")
                else:
                    cls.logger.info("no waiting task exists.")
            else:
                cls.logger.warning("CPU utilization is too high.")
        else:
            cls.logger.info("the maximum number of processes has been reached.")
        cls.logger.info("sleeping...")
        time.sleep(Config.get_property("scheduler.interval"))
    # send task group report
    Mail.send_group_result(group.group_name, group.result)
import sys

from fedflow.config import Config

if __name__ == "__main__":
    if len(sys.argv) > 1:
        if sys.argv[1] == "generate-config":
            if len(sys.argv) > 2:
                Config.generate_config(sys.argv[2])
            else:
                Config.generate_config(None)
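# A hedged note, not part of the original source. The entry point above allows the default
# configuration template to be generated from the command line, e.g. (the "fedflow" module name
# used here is an assumption):
#
#     python -m fedflow generate-config config.yaml
#
# The same call can be made directly from Python:
from fedflow.config import Config

Config.generate_config("config.yaml")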
import fedflow_test

import os

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision.datasets import mnist
from torchvision.transforms import transforms

from fedflow import Task, TaskGroup, FedFlow
from fedflow.config import Config
from fedflow.utils.trainer.supervised_trainer import SupervisedTrainer

Config.set_property("debug", True)
Config.set_property("scheduler.interval", 2)

datasets_path = os.path.join(os.path.abspath("."), "datasets")


class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 20, 5, 1)
        self.conv2 = nn.Conv2d(20, 50, 5, 1)
        self.fc1 = nn.Linear(4 * 4 * 50, 500)
        self.fc2 = nn.Linear(500, 10)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.max_pool2d(x, 2, 2)
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2, 2)
        x = x.view(-1, 4 * 4 * 50)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return self.softmax(x)
def detect_config():
    if os.path.exists("config.yaml"):
        Config.load("config.yaml")