def run():
    """Start a monitoring session for this computer and track it in a loop.

    A fresh session UUID and the current process id are written to
    ``session.txt`` and ``monitor.pid`` inside the config folder before the
    monitor is started. The function then loops forever, calling ``track()``
    once per minute.

    Raises:
        RuntimeError: if ``get_running_process()`` returns a non-zero PID.
    """
    from uuid import uuid1

    pid = get_running_process()
    if pid:
        raise RuntimeError(
            f'This computer is already being monitored. PID: {pid}')

    session_uuid = uuid1().hex
    config_folder = computer_singleton().config_folder

    # Persist the session id and our own PID side by side.
    state_files = (
        ('session.txt', session_uuid),
        ('monitor.pid', str(os.getpid())),
    )
    for file_name, content in state_files:
        with open(str(config_folder / file_name), 'w') as f:
            f.write(content)

    computer_monitor = monitor.MonitorComputer(session_uuid)
    system_info = {
        'os': monitor.get_os(),
        'cpu.logical': psutil.cpu_count(),
        'cpu.physical': psutil.cpu_count(logical=False),
    }
    computer_monitor.start(system_info)

    while True:
        with monit.section('Track'):
            computer_monitor.track()
        time.sleep(60)
def run(is_check_process: bool = True, open_browser: bool = True):
    """Start a monitoring session and track it with a growing poll interval.

    Args:
        is_check_process: when true, refuse to start if
            ``get_running_process()`` returns a non-zero PID.
        open_browser: forwarded to ``monitor.MonitorComputer``.

    Raises:
        RuntimeError: if the process check is enabled and a monitor PID is
            reported as running.
    """
    from uuid import uuid1

    # The lookup happens regardless of ``is_check_process``.
    pid = get_running_process()
    if pid and is_check_process:
        raise RuntimeError(
            f'This computer is already being monitored. PID: {pid}')

    session_uuid = uuid1().hex
    config_folder = computer_singleton().config_folder

    session_file = str(config_folder / 'session.txt')
    with open(session_file, 'w') as f:
        f.write(session_uuid)

    pid_file = str(config_folder / 'monitor.pid')
    with open(pid_file, 'w') as f:
        f.write(str(os.getpid()))

    computer_monitor = monitor.MonitorComputer(session_uuid, open_browser)
    computer_monitor.start()

    iteration = 0
    while True:
        with monit.section('Track', is_new_line=False):
            computer_monitor.track()

        # Delay is iteration / 5 seconds, clamped to the range [1, 60].
        delay = min(60.0, max(1.0, iteration / 5.0))
        time.sleep(delay)
        iteration += 1
def __init__(self):
    """Create the polling API caller and an empty result list."""
    from labml.internal.computer.projects.api import DirectApiCaller
    from labml.internal.computer.configs import computer_singleton

    web_api_polling = computer_singleton().web_api_polling
    caller_params = {'computer_uuid': computer_singleton().uuid}

    self.caller = DirectApiCaller(web_api_polling,
                                  caller_params,
                                  timeout_seconds=60)
    self.results = []
def start(self, *, run_uuid: Optional[str] = None, checkpoint: Optional[int] = None):
    """Start the experiment and return an ``ExperimentWatcher`` for it.

    NOTE(review): the block nesting below was reconstructed from a
    whitespace-collapsed listing - confirm it against the real file.

    Args:
        run_uuid: when given, ``__start_from_checkpoint`` is called with it
            and its return value is used as the starting global step;
            otherwise the starting global step is 0.
        checkpoint: passed to ``__start_from_checkpoint``; replaced by -1
            when ``run_uuid`` is given and ``checkpoint`` is ``None``
            (presumably meaning "latest checkpoint" - TODO confirm).

    Returns:
        ``ExperimentWatcher(self)``.

    Note:
        Calls ``exit(1)`` on rank 0 when ``check_repo_dirty`` is set and the
        run is dirty.
    """
    if run_uuid is not None:
        if checkpoint is None:
            checkpoint = -1
        global_step = self.__start_from_checkpoint(run_uuid, checkpoint)
    else:
        global_step = 0

    self.run.start_step = global_step

    self._start_tracker()
    tracker().set_start_global_step(global_step)

    # Only rank 0 prints the info and enforces the clean-repository check.
    if self.distributed_rank == 0:
        self.__print_info()
        if self.check_repo_dirty and self.run.is_dirty:
            logger.log([
                ("[FAIL]", Text.danger),
                " Cannot trial an experiment with uncommitted changes."
            ])
            exit(1)

    # Nothing below is saved or registered when only evaluating.
    if not self.is_evaluate:
        if self.distributed_rank == 0:
            from labml.internal.computer.configs import computer_singleton
            computer_singleton().add_project(lab_singleton().path)
            self.run.save_info()
        self._save_pid()

        # Config savers and external integrations are attached on rank 0 only.
        if self.distributed_rank == 0:
            if self.configs_processor is not None:
                self.configs_processor.add_saver(
                    FileConfigsSaver(self.run.configs_path))

            if self.web_api is not None:
                self.web_api.start(self.run)
                if self.configs_processor is not None:
                    self.configs_processor.add_saver(
                        self.web_api.get_configs_saver())
                    self.web_api.set_dynamic_handler(
                        ExperimentDynamicUpdateHandler(
                            self.configs_processor))

            if self.wandb is not None:
                self.wandb.init(self.run.name, self.run.run_path)
                if self.configs_processor is not None:
                    self.configs_processor.add_saver(
                        self.wandb.get_configs_saver())

            tracker().save_indicators(self.run.indicators_path)

    self.is_started = True
    return ExperimentWatcher(self)
def __init__(self, session_uuid: str, open_browser):
    """Build the API caller, writer, header and scanner for one session.

    Args:
        session_uuid: sent to the API caller together with the computer UUID.
        open_browser: forwarded to ``Header``.
    """
    configs = computer_singleton()
    web_api = configs.web_api

    caller_params = {
        'computer_uuid': configs.uuid,
        'session_uuid': session_uuid,
    }
    api_caller = ApiCaller(web_api.url,
                           caller_params,
                           timeout_seconds=120,
                           daemon=True)

    self.writer = Writer(api_caller, frequency=web_api.frequency)
    self.header = Header(api_caller,
                         frequency=web_api.frequency,
                         open_browser=open_browser)
    self.scanner = Scanner()
def __init__(self):
    """Build the API caller, writer and header, and start with empty data."""
    configs = computer_singleton()
    web_api = configs.web_api

    caller_params = {'computer_uuid': configs.uuid}
    # The third argument stays positional, exactly as the caller expects it.
    api_caller = ApiCaller(web_api.url, caller_params, 15)

    self.writer = Writer(api_caller, frequency=web_api.frequency)
    self.header = Header(api_caller,
                         frequency=web_api.frequency,
                         open_browser=web_api.open_browser)
    self.data = {}
def __init__(self):
    """Create the sync API caller; projects and runs start out empty."""
    from labml.internal.computer.projects.api import DirectApiCaller
    from labml.internal.computer.configs import computer_singleton

    web_api_sync = computer_singleton().web_api_sync
    caller_params = {'computer_uuid': computer_singleton().uuid}

    self.sync_caller = DirectApiCaller(web_api_sync,
                                       caller_params,
                                       timeout_seconds=15)
    self.projects = None
    self.runs = {}
def init_db():
    """Configure the model/index database drivers and create default projects.

    With ``settings.IS_LOCAL_SETUP`` the models and indexes are stored as
    files under ``<app_folder>/data``; otherwise a Redis connection to
    ``localhost:6379`` (db 0) is used. In both cases the float and samples
    projects are created afterwards.
    """
    data_path = computer_singleton().app_folder / 'data'

    # A single mkdir call replaces the exists()/mkdir() pair: it cannot race
    # with another process creating the folder, and it also creates missing
    # parent folders instead of failing.
    data_path.mkdir(parents=True, exist_ok=True)

    if settings.IS_LOCAL_SETUP:
        Model.set_db_drivers([
            FileDbDriver(PickleSerializer(), m, data_path / m.__name__)
            for _, m in Models
        ])
        Index.set_db_drivers([
            FileIndexDbDriver(YamlSerializer(), m,
                              data_path / f'{m.__name__}.yaml')
            for m in Indexes
        ])
    else:
        import redis

        db = redis.Redis(host='localhost', port=6379, db=0)
        Model.set_db_drivers([RedisDbDriver(s, m, db) for s, m in Models])
        Index.set_db_drivers([RedisIndexDbDriver(m, db) for m in Indexes])

    project.create_project(settings.FLOAT_PROJECT_TOKEN, 'float project')
    project.create_project(settings.SAMPLES_PROJECT_TOKEN, 'samples project')
def start(self, run: 'Run'):
    """Publish the run's metadata through the API caller and attach API logs.

    Adds an ``ApiUrlHandler`` to the API caller, merges the run's fields into
    ``self.data`` while holding ``self.lock``, signals ``has_data`` and then
    points ``API_LOGS`` at the same API caller.

    Args:
        run: the run whose attributes are copied into ``self.data``.
    """
    url_handler = ApiUrlHandler(self.open_browser, 'Monitor experiment at ')
    self.api_caller.add_handler(url_handler)

    with self.lock:
        from labml.internal.computer.configs import computer_singleton

        computer_uuid = computer_singleton().uuid
        run_info = {
            'name': run.name,
            'comment': run.comment,
            'computer': computer_uuid,
            'python_file': run.python_file,
            'repo_remotes': run.repo_remotes,
            'commit': run.commit,
            'commit_message': run.commit_message,
            'is_dirty': run.is_dirty,
            'start_step': run.start_step,
            'load_run': run.load_run,
            'tags': run.tags,
            'notes': run.notes,
        }
        self.data.update(run_info)
        self.api_caller.has_data(self)

    from labml.internal.api.logs import API_LOGS

    API_LOGS.set_api(self.api_caller, frequency=LOGS_FREQUENCY)
def set_token(self):
    """Prompt for an app.labml.ai token until a valid one is entered.

    Nothing is prompted when the configured web API is not the default one.
    Otherwise the user is asked repeatedly until the entered token is exactly
    32 characters long, and that token is then stored through
    ``computer_singleton().set_token``.

    Returns:
        ``True`` when the web API is not the default one; ``None`` after a
        token has been stored.
    """
    from labml.internal.computer.configs import computer_singleton

    if not computer_singleton().web_api.is_default:
        return True

    while True:
        token = input(
            'Enter app.labml.ai token (Go to Settings after logging into app.labml.ai):'
        )
        if len(token) == 32:
            break
        # Re-prompt instead of leaving the loop: previously an invalid token
        # was reported and then saved anyway.
        logger.log("Invalid token", Text.danger)

    computer_singleton().set_token(token)
def __init__(self, session_uuid: str):
    """Build the API caller, writer and header, and try to load py3nvml.

    Args:
        session_uuid: sent to the API caller together with the computer UUID.
    """
    configs = computer_singleton()
    web_api = configs.web_api

    caller_params = {
        'computer_uuid': configs.uuid,
        'session_uuid': session_uuid,
    }
    api_caller = ApiCaller(web_api.url,
                           caller_params,
                           timeout_seconds=15,
                           daemon=True)

    self.writer = Writer(api_caller, frequency=web_api.frequency)
    self.header = Header(api_caller,
                         frequency=web_api.frequency,
                         open_browser=web_api.open_browser)

    self.data = {}
    self.cache = {}
    self.nvml = None
    self.n_gpu = 0

    # py3nvml is optional; without it ``self.nvml`` stays ``None``.
    try:
        from py3nvml import py3nvml as nvml
    except ImportError:
        labml_notice('Install py3nvml to monitor GPUs:\n pip install py3nvml',
                     is_warn=False)
    else:
        self.nvml = nvml
def __init__(self, path: Path):
    """Describe the run stored at ``path``, scanning it unless the cache says it is complete.

    Args:
        path: location of the run; its stem is used as the run UUID.
    """
    from labml.internal.computer.configs import computer_singleton

    run_uuid = str(path.stem)
    self.uuid = run_uuid
    self.path = path
    self.cache_path = computer_singleton().runs_cache / run_uuid

    self.complete = False
    self.size = self.size_tensorboard = self.size_checkpoints = 0

    self.load_cache()
    if self.complete:
        return
    self.scan()
def get_running_process():
    """Return the PID stored in ``monitor.pid`` if that process is running.

    Returns:
        The PID as an ``int``, or ``0`` when the file is missing, its content
        is not an integer, or ``is_pid_running`` reports the process as gone.
    """
    pid_file = computer_singleton().config_folder / 'monitor.pid'
    if not pid_file.exists():
        return 0

    with open(str(pid_file), 'r') as f:
        raw_pid = f.read()

    try:
        pid = int(raw_pid)
    except ValueError:
        return 0

    return pid if is_pid_running(pid) else 0
def _test():
    """Manually start TensorBoard on one sample run and keep the process alive."""
    from labml.internal.computer.configs import computer_singleton
    from labml import lab
    import time

    starter = TensorBoardStarter(computer_singleton().tensorboard_symlink_dir)

    # Debug aid: print the items of os.environ here if needed.

    sample_run = (lab.get_path() / 'logs' / 'sample'
                  / '9f7970d6a98611ebbc6bacde48001122')
    result = starter.start([sample_run])
    print(result)

    time.sleep(100)
def _test():
    """Manually start TensorBoard on one sample run after setting the lab path."""
    from labml.internal.computer.configs import computer_singleton
    from labml import lab
    from labml.internal.lab import lab_singleton
    import time

    # The lab path is the folder four levels above this file.
    this_file = Path(os.path.abspath(__file__))
    lab_root = this_file.parent.parent.parent.parent
    lab_singleton().set_path(str(lab_root))

    starter = TensorBoardStarter(computer_singleton().tensorboard_symlink_dir)

    # Debug aid: print the items of os.environ here if needed.

    sample_run = (lab.get_path() / 'logs' / 'sample'
                  / '68233e98cb5311eb9aa38d17b08f3a1d')
    result = starter.start([sample_run])
    print(result)

    time.sleep(100)
from typing import List

from labml.internal.computer.configs import computer_singleton
from labml.internal.computer.projects.sync import SyncRuns
from labml.internal.manage import runs as manage_runs
from labml.internal.manage.tensorboard import TensorBoardStarter

# Module-level singletons shared by the handlers in this file.
SYNC_RUNS = SyncRuns()
TENSORBOARD_STARTER = TensorBoardStarter(
    computer_singleton().tensorboard_symlink_dir,
    computer_singleton().tensorboard_port,
    computer_singleton().tensorboard_visible_port,
    computer_singleton().tensorboard_protocol,
    computer_singleton().tensorboard_host,
)


def start_tensorboard(*, runs: List[str]):
    """Start TensorBoard for the given runs.

    Args:
        runs: identifiers passed to ``SYNC_RUNS.get_runs``.

    Returns:
        ``('success', {'url': ..., 'message': ...})`` when the starter reports
        success, otherwise ``('fail', {'message': ...})``.
    """
    run_paths = [run.path for run in SYNC_RUNS.get_runs(runs)]
    is_started, message = TENSORBOARD_STARTER.start(run_paths)

    if not is_started:
        return 'fail', {'message': message}

    return 'success', {
        'url': TENSORBOARD_STARTER.url,
        'message': message,
    }
def _monitor():
    """Run the computer monitor with the process check enabled."""
    from labml.internal.computer import process
    from labml.internal.computer.configs import computer_singleton

    # Whether to open the browser comes from the web API configuration.
    open_browser = computer_singleton().web_api.open_browser
    process.run(True, open_browser)
def __init__(self):
    """Record the home folder and the path of the ``labml.service`` unit file."""
    from labml.internal.computer.configs import computer_singleton

    configs = computer_singleton()
    self.home = configs.home

    user_units = configs.home / '.config' / 'systemd' / 'user'
    self.service_path = user_units / 'labml.service'
def load(self):
    """Rebuild ``self.projects`` from the project paths known to the computer configs."""
    from labml.internal.computer.configs import computer_singleton

    loaded = []
    for project_path in computer_singleton().get_projects():
        loaded.append(Project(Path(project_path)))
    self.projects = loaded