def process(self, batch: any, state: any):
    """
    Run one generator pass followed by one discriminator pass on a batch.

    `batch` is unpacked into `(data, target)`; both are moved to the
    discriminator's device. `state` is not used. Optimizer updates are only
    performed when `MODE_STATE.is_train` is set; losses are tracked either way.

    Returns an empty stats dictionary and `None` as the new state.
    """
    device = self.discriminator.device
    data, target = batch
    data, target = data.to(device), target.to(device)

    # Generator pass: score freshly generated samples with the discriminator
    with monit.section("generator"):
        latent = torch.normal(0, 1, (data.shape[0], 100), device=device)
        if MODE_STATE.is_train:
            self.generator_optimizer.zero_grad()
        logits = self.discriminator(self.generator(latent))
        loss = self.generator_loss(logits)
        tracker.add("loss.generator.", loss)
        if MODE_STATE.is_train:
            loss.backward()
            self.generator_optimizer.step()

    # Discriminator pass: real data vs. detached generated samples
    with monit.section("discriminator"):
        latent = torch.normal(0, 1, (data.shape[0], 100), device=device)
        if MODE_STATE.is_train:
            self.discriminator_optimizer.zero_grad()
        # Detach so that no gradients flow back into the generator here
        logits_false = self.discriminator(self.generator(latent).detach())
        logits_true = self.discriminator(data)
        loss = self.discriminator_loss(logits_true, logits_false)
        # Track under the discriminator key; this used to be logged as
        # "loss.generator." which mixed both losses into one indicator.
        tracker.add("loss.discriminator.", loss)
        if MODE_STATE.is_train:
            loss.backward()
            self.discriminator_optimizer.step()

    return {}, None
def step(self, batch: Any, batch_idx: BatchIndex):
    """
    Process a single batch: forward pass, loss tracking and, in training
    mode, backward pass with interval-based optimizer updates and saves.
    """
    # Put the model into train/eval mode to match the current mode
    self.model.train(self.mode.is_train)

    inputs = batch[0].to(self.device)
    labels = batch[1].to(self.device)

    # The global step is advanced by the number of samples while training
    if self.mode.is_train:
        tracker.add_global_step(len(inputs))

    # Forward pass, optionally with activation logging switched on
    log_activations = batch_idx.is_interval(self.log_activations_batches)
    with monit.section("model"):
        with self.mode.update(is_log_activations=log_activations):
            predictions = self.model(inputs)

    batch_loss = self.loss_func(predictions, labels)
    tracker.add("loss.", batch_loss)

    # Nothing else to do outside training
    if not self.mode.is_train:
        return

    with monit.section('backward'):
        batch_loss.backward()

    # Gradients accumulate until an update interval is reached
    if batch_idx.is_interval(self.update_batches):
        with monit.section('optimize'):
            self.optimizer.step()
        if batch_idx.is_interval(self.log_params_updates):
            tracker.add('model', self.model)
        self.optimizer.zero_grad()

    if batch_idx.is_interval(self.log_save_batches):
        tracker.save()
def run(command: List[str]):
    """
    Prepare the remote server and run `command` on it.

    The steps are executed in order: server setup, rsync of the project,
    python package update and finally the command itself. Each helper is
    expected to return an exit code; on the first non-zero code the current
    section is marked as failed, `fail(...)` is called with a message and
    the function returns without running the remaining steps.
    """
    client = connect()
    # The remote working directory reported by `pwd` is used as the home path
    _, home_path = execute(client, 'pwd')

    with monit.section("Setup server"):
        logger.log()
        if setup_server(client, home_path) != 0:
            monit.fail()
            fail("Failed to setup server")
            return

    logger.log()
    with monit.section("RSync"):
        logger.log()
        if rsync_project() != 0:
            monit.fail()
            fail("Failed to run rsync")
            return

    logger.log()
    with monit.section("Update python packages"):
        logger.log()
        if update_packages(client, home_path) != 0:
            monit.fail()
            fail("Failed to update packages")
            return

    # Visual separator before the output of the actual command
    logger.log('\n\n' + '-' * 40 + '\n\n')

    with monit.section("Run command"):
        logger.log()
        if run_command(client, home_path, command) != 0:
            monit.fail()
            fail("Failed to run command")
            return
def get_next_word(self, prompt: torch.Tensor, state: Any, rest: str, probs: List[float],
                  prediction_complete: PredictionComplete,
                  max_beam_size: int) -> List[Prediction]:
    """
    Run a beam search for at most 10 steps and return the collected
    predictions.

    The beam is seeded with `prompt.shape[1]` as its size. The search stops
    early when the beam returns no further prompt to expand.
    """
    search = BeamSearchSimple(
        beam_size=prompt.shape[1],
        prediction_complete=prediction_complete,
        max_beam_size=max_beam_size,
        rest=rest,
        state_updater=self.state_updater,
        probs=probs,
        is_token_by_token=self.is_token_by_token,
        itos=self.tokenizer.itos,
    )

    for _ in range(10):
        # Model predictions for the current prompts
        with monit.section('Predict', is_silent=True):
            next_token, new_state = self._get_predictions(prompt, state)

        # Feed them to the beam and get the next batch to expand
        with monit.section('Beam', is_silent=True):
            search.update(next_token, new_state, state)
            prompt, state = search.next_batch(prompt, new_state)

        if prompt is None:
            break

    # Each heap entry is indexed as `(first, (second, third))`
    return [Prediction(entry[0], entry[1][0], entry[1][1])
            for entry in search.result_heap]
def main():
    """
    Load a saved LSTM model checkpoint and evaluate it on every file of the
    validation split.
    """
    hidden_size = 1024
    n_layers = 3

    with monit.section("Loading data"):
        all_files = parser.load.load_files()
        # Only the validation part is used below
        _, valid_files = parser.load.split_train_valid(all_files, is_shuffle=False)

    with monit.section("Create model"):
        model = SimpleLstmModel(encoding_size=tokenizer.VOCAB_SIZE,
                                embedding_size=tokenizer.VOCAB_SIZE,
                                lstm_size=hidden_size,
                                lstm_layers=n_layers)
        model.to(device)

    # Register the model so that the checkpoint can be loaded into it
    experiment.add_pytorch_models({'base': model})
    experiment.load("94ab8470e6a711ea9703c1dbf199539e", 5654)

    # For debugging with a specific piece of source code, a `Predictor` can be
    # built from `(model, n_layers, hidden_size)`, fed strings with `add` and
    # queried with `get_suggestion`.

    # Evaluate all the files in the validation set
    for source_file in valid_files:
        logger.log(str(source_file.path), Text.heading)
        Evaluator(model, source_file, n_layers, hidden_size, skip_spaces=True).eval()
def step(self, batch: Any, batch_idx: BatchIndex):
    """
    Process one batch: encode, decode, compute the losses and, in training
    mode, run an optimization step with gradient clipping.

    `batch[0]` is the stroke data and `batch[1]` the mask; both arrive
    batch-first and are transposed to sequence-first here.
    """
    self.encoder.train(self.mode.is_train)
    self.decoder.train(self.mode.is_train)

    # Move `data` and `mask` to device and swap the sequence and batch dimensions.
    # `data` will have shape `[seq_len, batch_size, 5]` and
    # `mask` will have shape `[seq_len, batch_size]`.
    data = batch[0].to(self.device).transpose(0, 1)
    mask = batch[1].to(self.device).transpose(0, 1)

    # Increment step in training mode.
    # After the transpose the first dimension is the sequence length, so the
    # number of samples in the batch is `data.shape[1]`, not `len(data)`.
    if self.mode.is_train:
        tracker.add_global_step(data.shape[1])

    # Encode the sequence of strokes
    with monit.section("encoder"):
        # Get $z$, $\mu$, and $\hat{\sigma}$
        z, mu, sigma_hat = self.encoder(data)

    # Decode the mixture of distributions and $\hat{q}$
    with monit.section("decoder"):
        # Concatenate $[(\Delta x, \Delta y, p_1, p_2, p_3); z]$
        z_stack = z.unsqueeze(0).expand(data.shape[0] - 1, -1, -1)
        inputs = torch.cat([data[:-1], z_stack], 2)
        # Get mixture of distributions and $\hat{q}$
        dist, q_logits, _ = self.decoder(inputs, z, None)

    # Compute the loss
    with monit.section('loss'):
        # $L_{KL}$
        kl_loss = self.kl_div_loss(sigma_hat, mu)
        # $L_R$; targets are the inputs shifted by one step
        reconstruction_loss = self.reconstruction_loss(mask, data[1:], dist, q_logits)
        # $Loss = L_R + w_{KL} L_{KL}$
        loss = reconstruction_loss + self.kl_div_loss_weight * kl_loss

        # Track losses
        tracker.add("loss.kl.", kl_loss)
        tracker.add("loss.reconstruction.", reconstruction_loss)
        tracker.add("loss.total.", loss)

    # Only if we are in training state
    if self.mode.is_train:
        # Run optimizer
        with monit.section('optimize'):
            # Set `grad` to zero
            self.optimizer.zero_grad()
            # Compute gradients
            loss.backward()
            # Log model parameters and gradients
            if batch_idx.is_last:
                tracker.add(encoder=self.encoder, decoder=self.decoder)
            # Clip gradients
            nn.utils.clip_grad_norm_(self.encoder.parameters(), self.grad_clip)
            nn.utils.clip_grad_norm_(self.decoder.parameters(), self.grad_clip)
            # Optimize
            self.optimizer.step()

    tracker.save()
def build_index(conf: Configs, n_centeroids: int = 2048, code_size: int = 64, n_probe: int = 8,
                n_train: int = 200_000):
    """
    ## Build FAISS index

    Trains an `IndexIVFPQ` on a random sample of the stored keys, adds all keys
    to it in chunks with their positions as ids, and writes it to
    `faiss.index` in the data folder.

    [Getting started](https://github.com/facebookresearch/faiss/wiki/Getting-started),
    [faster search](https://github.com/facebookresearch/faiss/wiki/Faster-search),
    and [lower memory footprint](https://github.com/facebookresearch/faiss/wiki/Lower-memory-footprint)
    tutorials on FAISS will help you learn more about FAISS usage.
    """
    data_dir = lab.get_data_path()
    chunk_size = 1024

    # Dimensions of $f(c_i)$
    d_model = conf.transformer.d_model
    # Training data loader
    data_loader = conf.trainer.data_loader
    # Number of contexts; i.e. number of tokens in the training data minus one.
    # $\big(f(c_i), w_i\big)$ for $i \in [2, T]$
    token_shape = data_loader.data.shape
    n_keys = token_shape[0] * token_shape[1] - 1

    # Build an index with Voronoi cell based faster search with compression that
    # doesn't store full vectors.
    coarse_quantizer = faiss.IndexFlatL2(d_model)
    index = faiss.IndexIVFPQ(coarse_quantizer, d_model, n_centeroids, code_size, 8)
    index.nprobe = n_probe

    # Load the memory mapped numpy array of keys
    keys_store = np.memmap(str(data_dir / 'keys.npy'), dtype=np.float32, mode='r',
                           shape=(n_keys, d_model))

    # Pick a random sample of keys to train the index with
    sample_size = min(n_train, n_keys)
    random_sample = np.random.choice(np.arange(n_keys), size=[sample_size], replace=False)

    with monit.section('Train index'):
        # Train the index to store the keys
        index.train(keys_store[random_sample])

    # Add keys to the index in chunks; $\big(f(c_i), i\big)$
    for start in monit.iterate('Index', range(0, n_keys, chunk_size)):
        stop = min(start + chunk_size, n_keys)
        # $f(c_i)$ with ids $i$
        index.add_with_ids(keys_store[start:stop], np.arange(start, stop))

    with monit.section('Save'):
        # Save the index
        faiss.write_index(index, str(data_dir / 'faiss.index'))
def process(self, batch: any, state: any):
    """
    Run `self.discriminator_k` discriminator passes followed by one generator
    pass on a batch.

    Optimizer updates happen only when `MODE_STATE.is_train` is set. Returns a
    stats dictionary with the number of samples, and `None` as the new state.
    """
    device = self.discriminator.device
    data, target = batch
    data, target = data.to(device), target.to(device)
    batch_size = data.shape[0]

    # Train the discriminator
    with monit.section("discriminator"):
        for _ in range(self.discriminator_k):
            noise = torch.randn(batch_size, 100, device=device)
            if MODE_STATE.is_train:
                self.discriminator_optimizer.zero_grad()

            logits_true = self.discriminator(data)
            # Detached so that the generator receives no gradients here
            fake_images = self.generator(noise).detach()
            logits_false = self.discriminator(fake_images)
            loss_true, loss_false = self.discriminator_loss(logits_true, logits_false)
            d_loss = loss_true + loss_false

            # Log stuff
            tracker.add("loss.discriminator.true.", loss_true)
            tracker.add("loss.discriminator.false.", loss_false)
            tracker.add("loss.discriminator.", d_loss)

            # Train
            if MODE_STATE.is_train:
                d_loss.backward()
                if MODE_STATE.is_log_parameters:
                    pytorch_utils.store_model_indicators(self.discriminator, 'discriminator')
                self.discriminator_optimizer.step()

    # Train the generator
    with monit.section("generator"):
        noise = torch.randn(batch_size, 100, device=device)
        if MODE_STATE.is_train:
            self.generator_optimizer.zero_grad()

        generated_images = self.generator(noise)
        g_loss = self.generator_loss(self.discriminator(generated_images))

        # Log stuff
        tracker.add('generated', generated_images[0:5])
        tracker.add("loss.generator.", g_loss)

        # Train
        if MODE_STATE.is_train:
            g_loss.backward()
            if MODE_STATE.is_log_parameters:
                pytorch_utils.store_model_indicators(self.generator, 'generator')
            self.generator_optimizer.step()

    return {'samples': len(data)}, None
def step(self, batch: Any, batch_idx: BatchIndex):
    """
    Process one batch: `self.discriminator_k` discriminator passes followed by
    one generator pass. Optimizer updates are made only in training mode.
    """
    self.generator.train(self.mode.is_train)
    self.discriminator.train(self.mode.is_train)

    data = batch[0].to(self.device)
    target = batch[1].to(self.device)
    batch_size = data.shape[0]

    # Increment step in training mode
    if self.mode.is_train:
        tracker.add_global_step(len(data))

    # Train the discriminator
    with monit.section("discriminator"):
        for _ in range(self.discriminator_k):
            noise = torch.randn(batch_size, 100, device=self.device)
            logits_true = self.discriminator(data)
            # Detached so that the generator receives no gradients here
            fake_images = self.generator(noise).detach()
            logits_false = self.discriminator(fake_images)
            loss_true, loss_false = self.discriminator_loss(logits_true, logits_false)
            d_loss = loss_true + loss_false

            # Log stuff
            tracker.add("loss.discriminator.true.", loss_true)
            tracker.add("loss.discriminator.false.", loss_false)
            tracker.add("loss.discriminator.", d_loss)

            # Train
            if self.mode.is_train:
                self.discriminator_optimizer.zero_grad()
                d_loss.backward()
                if batch_idx.is_last:
                    tracker.add('discriminator', self.discriminator)
                self.discriminator_optimizer.step()

    # Train the generator
    with monit.section("generator"):
        noise = torch.randn(batch_size, 100, device=self.device)
        generated_images = self.generator(noise)
        g_loss = self.generator_loss(self.discriminator(generated_images))

        # Log stuff
        tracker.add('generated', generated_images[0:5])
        tracker.add("loss.generator.", g_loss)

        # Train
        if self.mode.is_train:
            self.generator_optimizer.zero_grad()
            g_loss.backward()
            if batch_idx.is_last:
                tracker.add('generator', self.generator)
            self.generator_optimizer.step()

    tracker.save()
def __init__(self, include_edges: bool = True):
    """
    Load the dataset

    Reads `cora/cora.content` (paper id, feature vector, class name per row)
    and `cora/cora.cites` (pairs of paper ids) from the data folder and
    builds `features`, `classes`, `labels` and `adj_mat`.

    :param include_edges: whether to mark citations in the adjacency matrix;
        when `False` the adjacency matrix stays an identity matrix.
    """
    # Whether to include edges.
    # This is to test how much accuracy is lost if we ignore the citation network.
    self.include_edges = include_edges

    # Download dataset
    self._download()

    # Read the paper ids, feature vectors, and labels
    with monit.section('Read content file'):
        content = np.genfromtxt(str(lab.get_data_path() / 'cora/cora.content'),
                                dtype=np.dtype(str))
    # Load the citations, it's a list of pairs of integers.
    with monit.section('Read citations file'):
        citations = np.genfromtxt(str(lab.get_data_path() / 'cora/cora.cites'),
                                  dtype=np.int32)

    # Get the feature vectors
    features = torch.tensor(np.array(content[:, 1:-1], dtype=np.float32))
    # Normalize the feature vectors so that each row sums to one
    self.features = features / features.sum(dim=1, keepdim=True)

    # Get the class names and assign a unique integer to each of them.
    # The names are sorted so that the mapping is the same on every run;
    # iterating a plain `set` of strings gives a run-dependent order.
    class_names = content[:, -1]
    self.classes = {s: i for i, s in enumerate(sorted(set(class_names)))}
    # Get the labels as those integers
    self.labels = torch.tensor([self.classes[name] for name in class_names],
                               dtype=torch.long)

    # Get the paper ids
    paper_ids = np.array(content[:, 0], dtype=np.int32)
    # Map of paper id to index
    ids_to_idx = {id_: i for i, id_ in enumerate(paper_ids)}

    # Empty adjacency matrix - an identity matrix
    self.adj_mat = torch.eye(len(self.labels), dtype=torch.bool)

    # Mark the citations in the adjacency matrix
    if self.include_edges:
        for e in citations:
            # The pair of paper indexes
            e1, e2 = ids_to_idx[e[0]], ids_to_idx[e[1]]
            # We build a symmetrical graph, where if paper $i$ referenced
            # paper $j$ we place an edge from $i$ to $j$ as well as an edge
            # from $j$ to $i$.
            self.adj_mat[e1, e2] = True
            self.adj_mat[e2, e1] = True
def __init__(self, validation_dates: int, skip_cache: bool = False):
    """
    Load dates and packets from the numpy cache in the data folder,
    rebuilding the cache first when asked to or when a cache file is missing.

    :param validation_dates: stored on the instance as `validation_dates`
    :param skip_cache: when `True`, rebuild the cache even if it exists
    """
    self.validation_dates = validation_dates

    data_dir = lab.get_data_path()
    dates_file = data_dir / 'dates.npy'
    packets_file = data_dir / 'packets.npy'

    # Rebuild unless both cache files are already there
    if skip_cache or not (dates_file.exists() and packets_file.exists()):
        with monit.section('Build cache'):
            build_cache()

    with monit.section("Cache"):
        self.dates = np.load(str(dates_file))
        packets = np.load(str(packets_file))
        self.packets = torch.tensor(packets, dtype=torch.float)
def build_cache():
    """
    Read `IBM_unadjusted.txt` from the data folder, parse and filter it, and
    store the result as `packets.npy` and `dates.npy` in the same folder.
    """
    data_dir = lab.get_data_path()

    with monit.section("Read data"):
        raw = pd.read_csv(str(data_dir / "IBM_unadjusted.txt"))
        frame = parse(raw)

    with monit.section("Filter pre-market data"):
        frame = filter_premarket(frame)

    with monit.section("To numpy"):
        dates, packets = to_numpy(frame)

    with monit.section("Save"):
        # Packets first, then dates
        for file_name, array in (("packets.npy", packets), ("dates.npy", dates)):
            np.save(str(data_dir / file_name), array)
def load_bundle(path: Path, url: Optional[str] = None) -> Tuple[str, int]:
    """
    Extract a run bundle (a gzipped tar archive) into the experiments and
    data folders.

    The archive must contain `info.json` with the run `uuid` and `checkpoint`.
    `run.yaml` and `configs.yaml` go to the bundled run folder, members under
    `checkpoint/` to its checkpoint folder, and members under `data/` to the
    data folder. If the run and that checkpoint already exist locally,
    nothing is extracted.

    :param path: location of the bundle archive
    :param url: if given, the bundle is downloaded from here to `path` first
    :return: `(run_uuid, checkpoint)` read from the bundle
    :raises FileNotFoundError: if the archive is missing
    :raises RuntimeError: if the archive has no usable `info.json` or contains
        a member that would be written outside its target folder
    """

    def _target_inside(base: Path, relative: str) -> Path:
        # Resolve `relative` below `base`, refusing paths that escape it.
        target = base / relative
        try:
            target.resolve().relative_to(base.resolve())
        except ValueError:
            raise RuntimeError(f"Corrupted bundle. Unsafe path in archive: {relative}") from None
        return target

    if url:
        download_file(url, path)

    if not path.exists():
        raise FileNotFoundError(f'Bundle archive missing: {path}')

    with monit.section('Extract bundle'):
        with tarfile.open(str(path), 'r:gz') as tar:
            files = tar.getmembers()

            # If the name occurs more than once the last member wins
            info_member = None
            for f in files:
                if f.name == 'info.json':
                    info_member = f

            if info_member is None:
                raise RuntimeError("Corrupted bundle. Missing info.json")

            info_file = tar.extractfile(info_member)
            # `extractfile` gives `None` for members that are not files or links
            if info_file is None:
                raise RuntimeError("Corrupted bundle. Missing info.json")
            with info_file as ef:
                info = json.load(ef)

            run_uuid, checkpoint = info['uuid'], info['checkpoint']
            run_path = get_run_by_uuid(run_uuid)

            # Skip extraction if this exact checkpoint is already available
            if run_path is not None:
                logger.log(f"Run {run_uuid} exists", Text.meta)
                current_checkpoint = _get_run_checkpoint(run_path, checkpoint)
                if checkpoint == current_checkpoint:
                    logger.log(f"Checkpoint {checkpoint} exists", Text.meta)
                    return run_uuid, checkpoint

            run_path = lab.get_experiments_path() / 'bundled' / run_uuid
            checkpoint_path = run_path / "checkpoints" / str(checkpoint)
            checkpoint_path.mkdir(parents=True, exist_ok=True)

            data_path = lab.get_data_path()
            data_path.mkdir(parents=True, exist_ok=True)

            for f in files:
                if f.name == 'run.yaml':
                    _extract_tar_file(tar, f, run_path / 'run.yaml')
                elif f.name == 'configs.yaml':
                    _extract_tar_file(tar, f, run_path / 'configs.yaml')
                elif f.name.startswith('checkpoint/'):
                    # Member names come from the archive, which may have been
                    # downloaded; do not let them point outside the folder.
                    p = _target_inside(checkpoint_path, f.name[len('checkpoint/'):])
                    p.parent.mkdir(parents=True, exist_ok=True)
                    _extract_tar_file(tar, f, p)
                elif f.name.startswith('data/'):
                    p = _target_inside(data_path, f.name[len('data/'):])
                    p.parent.mkdir(parents=True, exist_ok=True)
                    _extract_tar_file(tar, f, p)

            return run_uuid, checkpoint
def main(local_rank, rank, world_size, uuid, init_method: str = 'tcp://localhost:23456'):
    """
    Entry point of a single distributed worker: join the process group,
    configure the experiment and run it.
    """
    # Join the process group with the `gloo` backend
    with monit.section('Distributed'):
        torch.distributed.init_process_group("gloo",
                                             timeout=datetime.timedelta(seconds=30),
                                             init_method=init_method,
                                             rank=rank,
                                             world_size=world_size)

    conf = Configs()
    experiment.create(uuid=uuid, name="source_code_ddp", comment='lstm model')
    experiment.distributed(local_rank, world_size)

    # Configuration overrides; the CUDA device follows the local rank
    overrides = {
        'model': 'transformer_model',
        'n_layers': 6,
        'batch_size': 12,
        'epochs': 32,
        'optimizer.optimizer': 'Noam',
        'optimizer.learning_rate': 1.0,
        'device.cuda_device': local_rank,
        'seq_len': 512,
        'train_loader': 'shuffled_train_loader',
        'valid_loader': 'shuffled_valid_loader',
    }
    experiment.configs(conf, overrides)

    experiment.add_pytorch_models(model=conf.ddp_model)

    with experiment.start():
        conf.run()
def client(self) -> SSHClient:
    """
    Return the SSH client, creating and connecting it on first use.

    The client is stored in `self.__client` and reused on later calls.

    NOTE(review): the connection parameters below are hard-coded (host, user,
    proxy command) and do not use `self.conf`, although the section title
    prints `self.conf.hostname`. The user name and the proxy target look
    redacted in this copy. Confirm the intended values before relying on it.
    """
    if self.__client is None:
        self.__client = SSHClient()
        # NOTE(review): `AutoAddPolicy` presumably accepts unknown host keys
        # without prompting - confirm that this is acceptable here.
        self.__client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        with monit.section(f'Connecting to {self.conf.hostname}'):
            # Configuration based connection that is currently disabled:
            # self.__client.connect(hostname=self.conf.hostname,
            #                       username=self.conf.username,
            #                       pkey=self.conf.private_key,
            #                       password=self.conf.password)
            # Connect through a proxy command instead of a direct socket
            self.__client.connect(
                hostname='alya.inrialpes.fr',
                username='******',
                port=22,
                sock=paramiko.ProxyCommand(
                    command_line='ssh -W %h:%p [email protected]'),
                banner_timeout=20)
            # Other disabled variants used `ssh -q -W %h:%p` as the proxy
            # command, one of them against `localhost`.
    return self.__client
def command(self, command: str, env_vars: Dict[str, str], *,
            ui_mode: UIMode = UIMode.dots,
            is_background: bool,
            is_eval: bool):
    """
    Render the `run.sh` template for `command` and run it through
    `copy_and_run_script`.

    :return: the result object of `copy_and_run_script`
    :raises RemoteError: if the script exits with a non-zero code
    """
    # Accessing the property makes sure the client is created up front
    _ = self.client

    with monit.section("Run command"):
        logger.log()

        # Pipenv is used when there is a `Pipfile` in the current directory
        has_pipfile = (Path('.') / 'Pipfile').exists()
        template_values = {
            'use_pipenv': str(has_pipfile),
            'run_command': command,
            'environment_variables': get_env_vars(env_vars),
        }
        script = self.template_script('run.sh', template_values)

        res = self.copy_and_run_script(script, 'run.sh',
                                       ui_mode=ui_mode,
                                       is_background=is_background,
                                       is_eval=is_eval)

        if res.exit_code == 0:
            return res
        raise RemoteError("Failed to run command")
def rsync_jobs(self, *, ui_mode: UIMode = UIMode.dots, is_silent=False):
    """
    Copy the remote jobs folder of this server into the local project jobs
    folder with `rsync`.

    :raises RemoteError: if `rsync` exits with a non-zero code
    """
    with monit.section(f"RSync {self.conf.name} jobs", is_silent=is_silent):
        if not is_silent:
            logger.log()

        # The ssh transport, with the private key when one is configured
        key_file = self.conf.private_key_file
        if key_file is None:
            ssh_transport = '"ssh -o StrictHostKeyChecking=no"'
        else:
            ssh_transport = f'"ssh -o StrictHostKeyChecking=no -i {key_file}"'

        # Remote jobs folder (source) and local jobs folder (destination)
        source = (f'{self.conf.username}@{self.conf.hostname}:'
                  f'~/{self.project_name}/{Configs.get().remote_jobs_folder_name}/')
        destination = str(Configs.get().project_jobs_folder)

        rsync_cmd = ['rsync', '-zravuKLt', '--executability',
                     '-e', ssh_transport,
                     source,
                     destination]

        log_dir = self._get_log_folder(f'rsync_jobs_{self.conf.name}')
        exit_code = self.local_exec.stream(' '.join(rsync_cmd),
                                           log_dir=log_dir,
                                           ui_mode=ui_mode)
        if exit_code != 0:
            raise RemoteError("Failed to run rsync")
def section_silent():
    """
    Enter a silent `monit` section `N` times; each section adds one to a
    1000x1000 tensor ten times.
    """
    arr = torch.zeros((1000, 1000))
    for _ in range(N):
        with monit.section('run', is_silent=True):
            for _step in range(10):
                arr += 1
def __load_configs(self):
    """
    Load the configuration file from the home folder, creating it with a
    fresh uuid if it does not exist, fill in missing keys from the defaults
    and populate `self.uuid` and `self.web_api`.
    """
    config_file = self.home / _CONFIG_FILE_NAME

    if not config_file.exists():
        # First use: create a config holding only a new uuid
        with monit.section('Creating a .labml config'):
            from uuid import uuid1
            config = {'uuid': uuid1().hex}
            with open(str(config_file), 'w') as f:
                f.write(util.yaml_dump(config))
    else:
        with open(str(config_file)) as f:
            config = util.yaml_load(f.read())
            # An empty file loads as `None`
            if config is None:
                config = {}

    # Defaults never override values that are already set
    for key, value in self.__default_config().items():
        if key not in config:
            config[key] = value

    self.uuid = config['uuid']

    # Anything that is not a URL is treated as a token for the hosted API
    web_api_url = config['web_api']
    if web_api_url[0:4] != 'http':
        web_api_url = f"https://api.lab-ml.com/api/v1/computer?labml_token={web_api_url}&"

    self.web_api = WebAPIConfigs(url=web_api_url,
                                 frequency=config['web_api_frequency'],
                                 verify_connection=config['web_api_verify_connection'],
                                 open_browser=config['web_api_open_browser'])
def run():
    """
    Start monitoring this computer and track it once a minute, forever.

    Writes the new session uuid to `session.txt` and the process id to
    `monitor.pid` in the config folder.

    :raises RuntimeError: if a monitoring process is already running
    """
    running_pid = get_running_process()
    if running_pid:
        raise RuntimeError(f'This computer is already being monitored. PID: {running_pid}')

    from uuid import uuid1
    session_uuid = uuid1().hex

    # Record the session and the process that owns it
    with open(str(computer_singleton().config_folder / 'session.txt'), 'w') as f:
        f.write(session_uuid)
    with open(str(computer_singleton().config_folder / 'monitor.pid'), 'w') as f:
        f.write(str(os.getpid()))

    computer_monitor = monitor.MonitorComputer(session_uuid)
    system_info = {
        'os': monitor.get_os(),
        'cpu.logical': psutil.cpu_count(),
        'cpu.physical': psutil.cpu_count(logical=False),
    }
    computer_monitor.start(system_info)

    while True:
        with monit.section('Track'):
            computer_monitor.track()
        time.sleep(60)
def save_bundle(path: Path, run_uuid: str, checkpoint: int = -1, *,
                data_files: List[str]):
    """
    Create a bundle (a gzipped tar archive) of a run checkpoint.

    The archive contains the checkpoint folder as `checkpoint`, the run's
    `run.yaml` and `configs.yaml`, an `info.json` with the run uuid and
    checkpoint, and the listed data files under `data/`.

    :param path: where to write the archive
    :param run_uuid: uuid of the run to bundle
    :param checkpoint: requested checkpoint, resolved via `_get_run_checkpoint`
    :param data_files: paths relative to the data folder to include
    :raises RuntimeError: if the run or the checkpoint cannot be found
    """
    run_path = get_run_by_uuid(run_uuid)
    if run_path is None:
        raise RuntimeError(f"Couldn't find run {run_uuid}")

    # Keep the requested value so that the error message can report it
    resolved_checkpoint = _get_run_checkpoint(run_path, checkpoint)
    if resolved_checkpoint is None:
        raise RuntimeError(f"Couldn't find checkpoint {run_uuid}:{checkpoint}")

    # `info.json` is written next to the archive and removed afterwards
    info_path = path.parent / f'{path.stem}.info.json'
    info = {'uuid': run_uuid, 'checkpoint': resolved_checkpoint}
    checkpoint_path = run_path / "checkpoints" / str(resolved_checkpoint)

    try:
        with open(str(info_path), 'w') as f:
            f.write(json.dumps(info))

        with monit.section('Create bundle'):
            with tarfile.open(str(path), 'w:gz') as tar:
                tar.add(str(checkpoint_path), 'checkpoint')
                tar.add(str(run_path / 'run.yaml'), 'run.yaml')
                tar.add(str(run_path / 'configs.yaml'), 'configs.yaml')
                tar.add(str(info_path), 'info.json')
                for f in data_files:
                    tar.add(str(lab.get_data_path() / f), f'data/{f}')
    finally:
        # Do not leave the temporary file behind, even if archiving failed
        if info_path.exists():
            info_path.unlink()
def load_index(conf: Configs, n_probe: int = 8):
    """
    ## Load the index

    Reads `faiss.index` from the data folder and memory-maps `keys.npy` and
    `vals.npy`.

    :param conf: configurations; used for `d_model` and the training data shape
    :param n_probe: number of cells to probe when searching
    :return: `(index, keys_store, vals_store)`
    """
    # Dimensions of $f(c_i)$
    d_model = conf.transformer.d_model
    # Training data loader
    data_loader = conf.trainer.data_loader
    # Number of contexts; i.e. number of tokens in the training data minus one.
    # $\big(f(c_i), w_i\big)$ for $i \in [2, T]$
    n_keys = data_loader.data.shape[0] * data_loader.data.shape[1] - 1

    # Load FAISS index
    with monit.section('Load index'):
        index = faiss.read_index(str(lab.get_data_path() / 'faiss.index'))
    # Set number of cells to probe
    index.nprobe = n_probe

    # Load memory mapped numpy arrays
    keys_store = np.memmap(str(lab.get_data_path() / 'keys.npy'),
                           dtype=np.float32, mode='r', shape=(n_keys, d_model))
    # `np.int` was only an alias of the builtin `int` and has been removed
    # from NumPy; the builtin selects the same dtype.
    vals_store = np.memmap(str(lab.get_data_path() / 'vals.npy'),
                           dtype=int, mode='r', shape=(n_keys, 1))

    return index, keys_store, vals_store
def eval(self, command: str, *, log_dir: Optional[Path], is_silent=True) -> EvalResult:
    """
    Execute `command` through the client and collect its output.

    When `log_dir` is given, standard output, standard error and the exit
    code are also written to `stdout.log`, `stderr.log` and `exit_code` in
    that folder. A non-zero exit code marks the section as failed.

    :return: `EvalResult` with the exit code and the stripped, UTF-8 decoded
        standard output and standard error
    """
    with monit.section(f'Exec: {command}', is_silent=is_silent):
        _stdin, stdout, stderr = self.client.exec_command(command)
        out_bytes = stdout.read()
        err_bytes = stderr.read()
        exit_code = stdout.channel.recv_exit_status()

        if log_dir:
            # Raw streams are stored as bytes
            for log_name, payload in (('stdout.log', out_bytes), ('stderr.log', err_bytes)):
                with open(str(log_dir / log_name), 'wb') as f:
                    f.write(payload)
            with open(str(log_dir / 'exit_code'), 'w') as f:
                f.write(str(exit_code))

        if exit_code != 0:
            monit.fail()

        out_text = out_bytes.decode('utf-8').strip()
        err_text = err_bytes.decode('utf-8').strip()
        return EvalResult(exit_code, out_text, err_text)
def run(self):
    """
    Initialize, then call `run_step` once for every iteration of
    `self.training_loop`.
    """
    with monit.section("Initialize"):
        self.init()
    # The values are discarded; the attributes are only accessed here.
    # NOTE(review): presumably this forces them to be computed before the
    # loop starts - confirm against how these attributes are defined.
    _ = self.validator
    _ = self.trainer
    for _ in self.training_loop:
        self.run_step()
def section():
    """
    Enter a `monit` section `N` times; each section adds one to a 1000x1000
    tensor ten times.
    """
    arr = torch.zeros((1000, 1000))
    for _ in range(N):
        with monit.section('run'):
            for _step in range(10):
                arr += 1
def run(is_check_process: bool = True, open_browser: bool = True):
    """
    Start monitoring this computer and track it forever, with a sleep
    between tracks that grows from one second up to one minute.

    Writes the new session uuid to `session.txt` and the process id to
    `monitor.pid` in the config folder.

    :param is_check_process: whether to refuse to start when a monitoring
        process is already running
    :param open_browser: passed on to `MonitorComputer`
    :raises RuntimeError: if the check is enabled and a process is running
    """
    running_pid = get_running_process()
    if is_check_process and running_pid:
        raise RuntimeError(f'This computer is already being monitored. PID: {running_pid}')

    from uuid import uuid1
    session_uuid = uuid1().hex

    # Record the session and the process that owns it
    with open(str(computer_singleton().config_folder / 'session.txt'), 'w') as f:
        f.write(session_uuid)
    with open(str(computer_singleton().config_folder / 'monitor.pid'), 'w') as f:
        f.write(str(os.getpid()))

    computer_monitor = monitor.MonitorComputer(session_uuid, open_browser)
    computer_monitor.start()

    n_tracked = 0
    while True:
        with monit.section('Track', is_new_line=False):
            computer_monitor.track()
        # A fifth of a second more per iteration, clamped to [1, 60] seconds
        delay = max(1.0, n_tracked / 5.0)
        time.sleep(min(60.0, delay))
        n_tracked += 1
def download_repo(org: str, repo: str, idx: Optional[int]):
    """
    Download the `master` branch archive of a GitHub repository into the
    `download` folder of the data path.

    :param org: GitHub organization or user
    :param repo: repository name
    :param idx: optional index, only used as a prefix of the section title
    :return: path of the zip file; the existing file is returned without
        downloading if it is already there. `None` if the download fails
        with an HTTP error (the error is printed).
    """
    zip_file = Path(lab.get_data_path() / 'download' / f'{org}_{repo}.zip')

    if zip_file.exists():
        return zip_file

    idx_str = f"{idx:03}: " if idx is not None else ""

    with monit.section(f"{idx_str} {org}/{repo}") as s:
        try:
            response = urllib.request.urlopen(
                f'https://github.com/{org}/{repo}/archive/master.zip')
        except urllib.error.HTTPError as e:
            print(e)
            return None

        # Close the HTTP response as soon as the body has been read
        with response:
            content = response.read()

        size = len(content) // 1024
        s.message = f"{size :,}KB"

        # The download folder may not exist yet
        zip_file.parent.mkdir(parents=True, exist_ok=True)
        with open(str(zip_file), 'wb') as f:
            f.write(content)

    return zip_file
def main():
    """
    Evaluate the predictor on `sample.py` from the data folder.
    """
    predictor = get_predictor()

    sample_path = lab.get_data_path() / 'sample.py'
    with open(str(sample_path), 'r') as source:
        sample = source.read()

    with monit.section('Evaluate'):
        evaluate(predictor, sample)
def main(local_rank, rank, world_size, uuid, init_method: str = 'tcp://localhost:23456'):
    """
    Entry point of a single distributed worker: join the process group,
    configure the experiment, seed it and run it.
    """
    # Join the process group with the `gloo` backend
    with monit.section('Distributed'):
        torch.distributed.init_process_group("gloo",
                                             timeout=datetime.timedelta(seconds=30),
                                             init_method=init_method,
                                             rank=rank,
                                             world_size=world_size)

    conf = Configs()
    experiment.create(uuid=uuid, name='mnist ddp')
    experiment.distributed(local_rank, world_size)

    # Configuration overrides; the CUDA device follows the local rank
    overrides = {
        'optimizer.optimizer': 'Adam',
        'optimizer.learning_rate': 1e-4,
        'model': 'ddp_model',
        'device.cuda_device': local_rank,
    }
    experiment.configs(conf, overrides)

    conf.set_seed.set()

    experiment.add_pytorch_models({'model': conf.model})

    with experiment.start():
        conf.run()
def rsync(self, *, ui_mode: UIMode = UIMode.dots):
    """
    Copy the current directory to the project folder on this server with
    `rsync`, honouring the exclude file if it exists.

    :raises RemoteError: if `rsync` exits with a non-zero code
    """
    with monit.section(f"RSync {self.conf.name}"):
        logger.log()

        exclude_file = Configs.get().exclude_file.absolute()

        # Flags in `-zravuKLt`:
        # z = compress during transfer
        # r = recursive
        # a = archive mode, equivalent to `-rlptgoD`
        # v = verbose
        # u = update (skip files that are newer on the receiver)
        # K = keep directory symlinks on the receiver
        # L = copy the referent of symlinks instead of the link
        # t = preserve modification times
        # and from the archive mode:
        # l = copy symlinks as symlinks
        # p = preserve permissions
        # g = preserve group
        # o = preserve owner
        # D = preserve device and special files
        rsync_cmd = ['rsync', '-zravuKLt', '--executability']

        # The ssh transport, with the private key when one is configured
        key_file = self.conf.private_key_file
        if key_file is None:
            ssh_transport = '"ssh -o StrictHostKeyChecking=no"'
        else:
            ssh_transport = f'"ssh -o StrictHostKeyChecking=no -i {key_file}"'
        rsync_cmd.extend(['-e', ssh_transport])

        if exclude_file.exists():
            rsync_cmd.append(f"--exclude-from='{str(exclude_file)}'")

        # Source
        rsync_cmd.append('./')
        # Destination
        rsync_cmd.append(f'{self.conf.username}@{self.conf.hostname}:~/{self.project_name}/')

        log_dir = self._get_log_folder(f'rsync_{self.conf.name}')
        exit_code = self.local_exec.stream(' '.join(rsync_cmd),
                                           log_dir=log_dir,
                                           ui_mode=ui_mode)
        if exit_code != 0:
            raise RemoteError("Failed to run rsync")