Example #1
    def process(self, batch: Any, state: Any):
        device = self.discriminator.device
        data, target = batch
        data, target = data.to(device), target.to(device)

        with monit.section("generator"):
            latent = torch.normal(0, 1, (data.shape[0], 100), device=device)
            if MODE_STATE.is_train:
                self.generator_optimizer.zero_grad()
            logits = self.discriminator(self.generator(latent))
            loss = self.generator_loss(logits)
            tracker.add("loss.generator.", loss)
            if MODE_STATE.is_train:
                loss.backward()
                self.generator_optimizer.step()

        with monit.section("discriminator"):
            latent = torch.normal(0, 1, (data.shape[0], 100), device=device)
            if MODE_STATE.is_train:
                self.discriminator_optimizer.zero_grad()
            logits_false = self.discriminator(self.generator(latent).detach())
            logits_true = self.discriminator(data)
            loss = self.discriminator_loss(logits_true, logits_false)
            tracker.add("loss.generator.", loss)
            if MODE_STATE.is_train:
                loss.backward()
                self.discriminator_optimizer.step()

        return {}, None
Example #2
    def step(self, batch: Any, batch_idx: BatchIndex):
        self.model.train(self.mode.is_train)
        data, target = batch[0].to(self.device), batch[1].to(self.device)

        if self.mode.is_train:
            tracker.add_global_step(len(data))

        is_log_activations = batch_idx.is_interval(
            self.log_activations_batches)
        with monit.section("model"):
            with self.mode.update(is_log_activations=is_log_activations):
                output = self.model(data)

        loss = self.loss_func(output, target)
        tracker.add("loss.", loss)

        if self.mode.is_train:
            with monit.section('backward'):
                loss.backward()

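            # Step the optimizer only every `update_batches` batches (gradient accumulation)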
            if batch_idx.is_interval(self.update_batches):
                with monit.section('optimize'):
                    self.optimizer.step()
                if batch_idx.is_interval(self.log_params_updates):
                    tracker.add('model', self.model)
                self.optimizer.zero_grad()

            if batch_idx.is_interval(self.log_save_batches):
                tracker.save()
Example #3
def run(command: List[str]):
    client = connect()
    _, home_path = execute(client, 'pwd')

    with monit.section("Setup server"):
        logger.log()
        if setup_server(client, home_path) != 0:
            monit.fail()
            fail("Failed to setup server")
            return

    logger.log()
    with monit.section("RSync"):
        logger.log()
        if rsync_project() != 0:
            monit.fail()
            fail("Failed to run rsync")
            return

    logger.log()
    with monit.section("Update python packages"):
        logger.log()
        if update_packages(client, home_path) != 0:
            monit.fail()
            fail("Failed to update packages")
            return

    logger.log('\n\n' + '-' * 40 + '\n\n')

    with monit.section("Run command"):
        logger.log()
        if run_command(client, home_path, command) != 0:
            monit.fail()
            fail("Failed to run command")
            return
Example #4
    def get_next_word(self, prompt: torch.Tensor, state: Any, rest: str, probs: List[float],
                      prediction_complete: PredictionComplete,
                      max_beam_size: int) -> List[Prediction]:
        beam = BeamSearchSimple(beam_size=prompt.shape[1],
                                prediction_complete=prediction_complete,
                                max_beam_size=max_beam_size,
                                rest=rest,
                                state_updater=self.state_updater,
                                probs=probs,
                                is_token_by_token=self.is_token_by_token,
                                itos=self.tokenizer.itos)

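        # Expand the beam for at most 10 prediction steps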
        for _ in range(10):
            with monit.section('Predict', is_silent=True):
                next_token, new_state = self._get_predictions(prompt, state)
            with monit.section('Beam', is_silent=True):
                beam.update(next_token, new_state, state)
                prompt, state = beam.next_batch(prompt, new_state)

            if prompt is None:
                break

        results = [Prediction(r[0], r[1][0], r[1][1]) for r in beam.result_heap]
        return results
Example #5
def main():
    lstm_size = 1024
    lstm_layers = 3

    with monit.section("Loading data"):
        files = parser.load.load_files()
        train_files, valid_files = parser.load.split_train_valid(
            files, is_shuffle=False)

    with monit.section("Create model"):
        model = SimpleLstmModel(encoding_size=tokenizer.VOCAB_SIZE,
                                embedding_size=tokenizer.VOCAB_SIZE,
                                lstm_size=lstm_size,
                                lstm_layers=lstm_layers)
        model.to(device)

    experiment.add_pytorch_models({'base': model})

    experiment.load("94ab8470e6a711ea9703c1dbf199539e", 5654)

    # For debugging with a specific piece of source code
    # predictor = Predictor(model, lstm_layers, lstm_size)
    # for s in ['""" """\n', "from __future__"]:
    #     predictor.add(s)
    # s = predictor.get_suggestion()

    # Evaluate all the files in validation set
    for file in valid_files:
        logger.log(str(file.path), Text.heading)
        evaluator = Evaluator(model,
                              file,
                              lstm_layers,
                              lstm_size,
                              skip_spaces=True)
        evaluator.eval()
Example #6
File: __init__.py Project: wx-b/nn
    def step(self, batch: Any, batch_idx: BatchIndex):
        self.encoder.train(self.mode.is_train)
        self.decoder.train(self.mode.is_train)

        # Move `data` and `mask` to device and swap the sequence and batch dimensions.
        # `data` will have shape `[seq_len, batch_size, 5]` and
        # `mask` will have shape `[seq_len, batch_size]`.
        data = batch[0].to(self.device).transpose(0, 1)
        mask = batch[1].to(self.device).transpose(0, 1)

        # Increment step in training mode
        if self.mode.is_train:
            tracker.add_global_step(len(data))

        # Encode the sequence of strokes
        with monit.section("encoder"):
            # Get $z$, $\mu$, and $\hat{\sigma}$
            z, mu, sigma_hat = self.encoder(data)

        # Decode the mixture of distributions and $\hat{q}$
        with monit.section("decoder"):
            # Concatenate $[(\Delta x, \Delta y, p_1, p_2, p_3); z]$
            z_stack = z.unsqueeze(0).expand(data.shape[0] - 1, -1, -1)
            inputs = torch.cat([data[:-1], z_stack], 2)
            # Get mixture of distributions and $\hat{q}$
            dist, q_logits, _ = self.decoder(inputs, z, None)

        # Compute the loss
        with monit.section('loss'):
            # $L_{KL}$
            kl_loss = self.kl_div_loss(sigma_hat, mu)
            # $L_R$
            reconstruction_loss = self.reconstruction_loss(mask, data[1:], dist, q_logits)
            # $Loss = L_R + w_{KL} L_{KL}$
            loss = reconstruction_loss + self.kl_div_loss_weight * kl_loss

            # Track losses
            tracker.add("loss.kl.", kl_loss)
            tracker.add("loss.reconstruction.", reconstruction_loss)
            tracker.add("loss.total.", loss)

        # Only if we are in training state
        if self.mode.is_train:
            # Run optimizer
            with monit.section('optimize'):
                # Set `grad` to zero
                self.optimizer.zero_grad()
                # Compute gradients
                loss.backward()
                # Log model parameters and gradients
                if batch_idx.is_last:
                    tracker.add(encoder=self.encoder, decoder=self.decoder)
                # Clip gradients
                nn.utils.clip_grad_norm_(self.encoder.parameters(), self.grad_clip)
                nn.utils.clip_grad_norm_(self.decoder.parameters(), self.grad_clip)
                # Optimize
                self.optimizer.step()

        tracker.save()
Example #7
def build_index(conf: Configs,
                n_centroids: int = 2048,
                code_size: int = 64,
                n_probe: int = 8,
                n_train: int = 200_000):
    """
    ## Build FAISS index

    [Getting started](https://github.com/facebookresearch/faiss/wiki/Getting-started),
    [faster search](https://github.com/facebookresearch/faiss/wiki/Faster-search),
    and [lower memory footprint](https://github.com/facebookresearch/faiss/wiki/Lower-memory-footprint)
    tutorials on FAISS will help you learn more about FAISS usage.
    """
    # Dimensions of $f(c_i)$
    d_model = conf.transformer.d_model
    # Training data loader
    data_loader = conf.trainer.data_loader
    # Number of contexts; i.e. number of tokens in the training data minus one.
    # $\big(f(c_i), w_i\big)$ for $i \in [2, T]$
    n_keys = data_loader.data.shape[0] * data_loader.data.shape[1] - 1

    # Build an index with Voronoi cell based faster search and a compression
    # scheme that doesn't store the full vectors.
    quantizer = faiss.IndexFlatL2(d_model)
    index = faiss.IndexIVFPQ(quantizer, d_model, n_centroids, code_size, 8)
    index.nprobe = n_probe

    # Load the memory mapped numpy array of keys
    keys_store = np.memmap(str(lab.get_data_path() / 'keys.npy'),
                           dtype=np.float32,
                           mode='r',
                           shape=(n_keys, d_model))

    # Pick a random sample of keys to train the index with
    random_sample = np.random.choice(np.arange(n_keys),
                                     size=[min(n_train, n_keys)],
                                     replace=False)

    with monit.section('Train index'):
        # Train the index to store the keys
        index.train(keys_store[random_sample])

    # Add keys to the index; $\big(f(c_i), i\big)$
    for s in monit.iterate('Index', range(0, n_keys, 1024)):
        e = min(s + 1024, n_keys)
        # $f(c_i)$
        keys = keys_store[s:e]
        # $i$
        idx = np.arange(s, e)
        # Add to index
        index.add_with_ids(keys, idx)

    with monit.section('Save'):
        # Save the index
        faiss.write_index(index, str(lab.get_data_path() / 'faiss.index'))
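
A quick sanity check, not part of the original function: read the index back and query it with a few stored keys. Since IVFPQ is approximate, the first neighbour should mostly be the key itself. `keys_store` here is the memmap opened above.

    # Minimal sketch, assuming `build_index` above has run with the same paths.
    index = faiss.read_index(str(lab.get_data_path() / 'faiss.index'))
    queries = np.ascontiguousarray(keys_store[:16])
    # 4 nearest neighbours per query; returns (distances, ids)
    distances, neighbors = index.search(queries, 4)
    # For most rows, neighbors[:, 0] should equal 0..15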
Example #8
    def process(self, batch: Any, state: Any):
        device = self.discriminator.device
        data, target = batch
        data, target = data.to(device), target.to(device)

        # Train the discriminator
        with monit.section("discriminator"):
            for _ in range(self.discriminator_k):
                latent = torch.randn(data.shape[0], 100, device=device)
                if MODE_STATE.is_train:
                    self.discriminator_optimizer.zero_grad()
                logits_true = self.discriminator(data)
                logits_false = self.discriminator(
                    self.generator(latent).detach())
                loss_true, loss_false = self.discriminator_loss(
                    logits_true, logits_false)
                loss = loss_true + loss_false

                # Log stuff
                tracker.add("loss.discriminator.true.", loss_true)
                tracker.add("loss.discriminator.false.", loss_false)
                tracker.add("loss.discriminator.", loss)

                # Train
                if MODE_STATE.is_train:
                    loss.backward()
                    if MODE_STATE.is_log_parameters:
                        pytorch_utils.store_model_indicators(
                            self.discriminator, 'discriminator')
                    self.discriminator_optimizer.step()

        # Train the generator
        with monit.section("generator"):
            latent = torch.randn(data.shape[0], 100, device=device)
            if MODE_STATE.is_train:
                self.generator_optimizer.zero_grad()
            generated_images = self.generator(latent)
            logits = self.discriminator(generated_images)
            loss = self.generator_loss(logits)

            # Log stuff
            tracker.add('generated', generated_images[0:5])
            tracker.add("loss.generator.", loss)

            # Train
            if MODE_STATE.is_train:
                loss.backward()
                if MODE_STATE.is_log_parameters:
                    pytorch_utils.store_model_indicators(
                        self.generator, 'generator')
                self.generator_optimizer.step()

        return {'samples': len(data)}, None
Example #9
    def step(self, batch: Any, batch_idx: BatchIndex):
        self.generator.train(self.mode.is_train)
        self.discriminator.train(self.mode.is_train)

        data, target = batch[0].to(self.device), batch[1].to(self.device)

        # Increment step in training mode
        if self.mode.is_train:
            tracker.add_global_step(len(data))

        # Train the discriminator
        with monit.section("discriminator"):
            for _ in range(self.discriminator_k):
                latent = torch.randn(data.shape[0], 100, device=self.device)
                logits_true = self.discriminator(data)
                logits_false = self.discriminator(self.generator(latent).detach())
                loss_true, loss_false = self.discriminator_loss(logits_true, logits_false)
                loss = loss_true + loss_false

                # Log stuff
                tracker.add("loss.discriminator.true.", loss_true)
                tracker.add("loss.discriminator.false.", loss_false)
                tracker.add("loss.discriminator.", loss)

                # Train
                if self.mode.is_train:
                    self.discriminator_optimizer.zero_grad()
                    loss.backward()
                    if batch_idx.is_last:
                        tracker.add('discriminator', self.discriminator)
                    self.discriminator_optimizer.step()

        # Train the generator
        with monit.section("generator"):
            latent = torch.randn(data.shape[0], 100, device=self.device)
            generated_images = self.generator(latent)
            logits = self.discriminator(generated_images)
            loss = self.generator_loss(logits)

            # Log stuff
            tracker.add('generated', generated_images[0:5])
            tracker.add("loss.generator.", loss)

            # Train
            if self.mode.is_train:
                self.generator_optimizer.zero_grad()
                loss.backward()
                if batch_idx.is_last:
                    tracker.add('generator', self.generator)
                self.generator_optimizer.step()

        tracker.save()
Example #10
File: experiment.py Project: Hadryan/nn
    def __init__(self, include_edges: bool = True):
        """
        Load the dataset
        """

        # Whether to include edges.
        # This tests how much accuracy is lost if we ignore the citation network.
        self.include_edges = include_edges

        # Download dataset
        self._download()

        # Read the paper ids, feature vectors, and labels
        with monit.section('Read content file'):
            content = np.genfromtxt(str(lab.get_data_path() /
                                        'cora/cora.content'),
                                    dtype=np.dtype(str))
        # Load the citations; it's a list of integer pairs (paper IDs).
        with monit.section('Read citations file'):
            citations = np.genfromtxt(str(lab.get_data_path() /
                                          'cora/cora.cites'),
                                      dtype=np.int32)

        # Get the feature vectors
        features = torch.tensor(np.array(content[:, 1:-1], dtype=np.float32))
        # Normalize the feature vectors
        self.features = features / features.sum(dim=1, keepdim=True)

        # Get the class names and assign a unique integer to each of them
        self.classes = {s: i for i, s in enumerate(set(content[:, -1]))}
        # Get the labels as those integers
        self.labels = torch.tensor([self.classes[i] for i in content[:, -1]],
                                   dtype=torch.long)

        # Get the paper ids
        paper_ids = np.array(content[:, 0], dtype=np.int32)
        # Map of paper id to index
        ids_to_idx = {id_: i for i, id_ in enumerate(paper_ids)}

        # Empty adjacency matrix - an identity matrix
        self.adj_mat = torch.eye(len(self.labels), dtype=torch.bool)

        # Mark the citations in the adjacency matrix
        if self.include_edges:
            for e in citations:
                # The pair of paper indices
                e1, e2 = ids_to_idx[e[0]], ids_to_idx[e[1]]
                # We build a symmetric graph: if paper $i$ cited
                # paper $j$, we place an edge from $i$ to $j$ as well as an edge
                # from $j$ to $i$.
                self.adj_mat[e1][e2] = True
                self.adj_mat[e2][e1] = True
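
A minimal usage sketch for reference; `CoraDataset` is an assumed name for the class this constructor belongs to, so treat it as hypothetical:

    dataset = CoraDataset(include_edges=True)
    print(dataset.features.shape)  # one normalized feature vector per paper
    print(dataset.adj_mat.shape)   # boolean adjacency matrix with self-loops
    print(len(dataset.classes))    # number of distinct paper classes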
Example #11
    def __init__(self, validation_dates: int, skip_cache: bool = False):
        self.validation_dates = validation_dates

        dates_cache_path = lab.get_data_path() / 'dates.npy'
        packets_cache_path = lab.get_data_path() / 'packets.npy'

        if skip_cache or not dates_cache_path.exists() or not packets_cache_path.exists():
            with monit.section('Build cache'):
                build_cache()

        with monit.section("Cache"):
            self.dates = np.load(str(dates_cache_path))
            self.packets = torch.tensor(np.load(str(packets_cache_path)),
                                        dtype=torch.float)
Example #12
def build_cache():
    data_path = lab.get_data_path()

    with monit.section("Read data"):
        df = pd.read_csv(str(data_path / "IBM_unadjusted.txt"))
    df = parse(df)
    with monit.section("Filter pre-market data"):
        df = filter_premarket(df)

    with monit.section("To numpy"):
        dates, packets = to_numpy(df)

    with monit.section("Save"):
        np.save(str(data_path / "packets.npy"), packets)
        np.save(str(data_path / "dates.npy"), dates)
Example #13
def load_bundle(path: Path, url: Optional[str] = None) -> Tuple[str, int]:
    if url:
        download_file(url, path)

    if not path.exists():
        raise FileNotFoundError(f'Bundle archive missing: {path}')

    with monit.section('Extract bundle'):
        with tarfile.open(str(path), 'r:gz') as tar:
            files = tar.getmembers()
            info_member = None
            for f in files:
                if f.name == 'info.json':
                    info_member = f

            if not info_member:
                raise RuntimeError(f"Corrupted bundle. Missing info.json")

            with tar.extractfile(info_member) as ef:
                info = json.load(ef)

            run_uuid, checkpoint = info['uuid'], info['checkpoint']
            run_path = get_run_by_uuid(run_uuid)

            if run_path is not None:
                logger.log(f"Run {run_uuid} exists", Text.meta)
                current_checkpoint = _get_run_checkpoint(run_path, checkpoint)
                if checkpoint == current_checkpoint:
                    logger.log(f"Checkpoint {checkpoint} exists", Text.meta)
                    return run_uuid, checkpoint

            run_path = lab.get_experiments_path() / 'bundled' / run_uuid

            checkpoint_path = run_path / "checkpoints" / str(checkpoint)
            if not checkpoint_path.exists():
                checkpoint_path.mkdir(parents=True)

            data_path = lab.get_data_path()
            if not data_path.exists():
                data_path.mkdir(parents=True)

            for f in files:
                if f.name == 'run.yaml':
                    _extract_tar_file(tar, f, run_path / 'run.yaml')
                elif f.name == 'configs.yaml':
                    _extract_tar_file(tar, f, run_path / 'configs.yaml')
                elif f.name.startswith('checkpoint/'):
                    p = f.name[len('checkpoint/'):]
                    p = checkpoint_path / p
                    if not p.parent.exists():
                        p.parent.mkdir(parents=True)
                    _extract_tar_file(tar, f, p)
                elif f.name.startswith('data/'):
                    p = f.name[len('data/'):]
                    p = data_path / p
                    if not p.parent.exists():
                        p.parent.mkdir(parents=True)
                    _extract_tar_file(tar, f, p)

            return run_uuid, checkpoint
Example #14
def main(local_rank,
         rank,
         world_size,
         uuid,
         init_method: str = 'tcp://localhost:23456'):
    with monit.section('Distributed'):
        torch.distributed.init_process_group(
            "gloo",
            timeout=datetime.timedelta(seconds=30),
            init_method=init_method,
            rank=rank,
            world_size=world_size)
    conf = Configs()
    experiment.create(uuid=uuid, name="source_code_ddp", comment='lstm model')
    experiment.distributed(local_rank, world_size)
    experiment.configs(
        conf, {
            'model': 'transformer_model',
            'n_layers': 6,
            'batch_size': 12,
            'epochs': 32,
            'optimizer.optimizer': 'Noam',
            'optimizer.learning_rate': 1.0,
            'device.cuda_device': local_rank,
            'seq_len': 512,
            'train_loader': 'shuffled_train_loader',
            'valid_loader': 'shuffled_valid_loader'
        })
    experiment.add_pytorch_models(model=conf.ddp_model)
    with experiment.start():
        conf.run()
Example #15
    def client(self) -> SSHClient:
        if self.__client is None:
            self.__client = SSHClient()
            self.__client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
            with monit.section(f'Connecting to {self.conf.hostname}'):
                # self.__client.connect(hostname=self.conf.hostname,
                #                       username=self.conf.username,
                #                       pkey=self.conf.private_key,
                #                       password=self.conf.password)
                self.__client.connect(
                    hostname='alya.inrialpes.fr',
                    username='******',
                    port=22,
                    sock=paramiko.ProxyCommand(
                        command_line='ssh -W %h:%p [email protected]'),
                    banner_timeout=20)
                # sock=paramiko.ProxyCommand(
                #     command_line='ssh -q -W %h:%p *****@*****.**'),
                # self.__client.connect(hostname='localhost',
                #                       username='******',
                #                       port=22,
                #                       sock=paramiko.ProxyCommand(
                #                           command_line='ssh -q -W %h:%p gaetan@localhost'))
        return self.__client
Example #16
    def command(self,
                command: str,
                env_vars: Dict[str, str],
                *,
                ui_mode: UIMode = UIMode.dots,
                is_background: bool,
                is_eval: bool):
        _ = self.client
        with monit.section("Run command"):
            logger.log()
            pipfile = Path('.') / 'Pipfile'
            # requirements = Path('.') / 'requirements.txt'

            script = self.template_script(
                'run.sh', {
                    'use_pipenv': str(pipfile.exists()),
                    'run_command': command,
                    'environment_variables': get_env_vars(env_vars)
                })

            res = self.copy_and_run_script(script,
                                           'run.sh',
                                           ui_mode=ui_mode,
                                           is_background=is_background,
                                           is_eval=is_eval)

            if res.exit_code != 0:
                raise RemoteError("Failed to run command")

            return res
Example #17
    def rsync_jobs(self, *, ui_mode: UIMode = UIMode.dots, is_silent=False):
        with monit.section(f"RSync {self.conf.name} jobs",
                           is_silent=is_silent):
            if not is_silent:
                logger.log()
            rsync_cmd = ['rsync', '-zravuKLt', '--executability']
            if self.conf.private_key_file is not None:
                rsync_cmd += [
                    '-e',
                    f'"ssh -o StrictHostKeyChecking=no -i {self.conf.private_key_file}"'
                ]
            else:
                rsync_cmd += ['-e', '"ssh -o StrictHostKeyChecking=no"']
            rsync_cmd += [
                f'{self.conf.username}@{self.conf.hostname}:'
                f'~/{self.project_name}/{Configs.get().remote_jobs_folder_name}/'
            ]
            rsync_cmd += [str(Configs.get().project_jobs_folder)]

            log_dir = self._get_log_folder(f'rsync_jobs_{self.conf.name}')
            exit_code = self.local_exec.stream(' '.join(rsync_cmd),
                                               log_dir=log_dir,
                                               ui_mode=ui_mode)

            if exit_code != 0:
                raise RemoteError("Failed to run rsync")
Example #18
def section_silent():
    arr = torch.zeros((1000, 1000))

    for i in range(N):
        with monit.section('run', is_silent=True):
            for t in range(10):
                arr += 1
Example #19
    def __load_configs(self):
        config_file = self.home / _CONFIG_FILE_NAME

        if config_file.exists():
            with open(str(config_file)) as f:
                config = util.yaml_load(f.read())
                if config is None:
                    config = {}
        else:
            with monit.section('Creating a .labml config'):
                from uuid import uuid1
                config = {'uuid': uuid1().hex}
                with open(str(config_file), 'w') as f:
                    f.write(util.yaml_dump(config))

        default_config = self.__default_config()
        for k, v in default_config.items():
            if k not in config:
                config[k] = v

        self.uuid = config['uuid']
        web_api_url = config['web_api']
        if not web_api_url.startswith('http'):
            web_api_url = f"https://api.lab-ml.com/api/v1/computer?labml_token={web_api_url}&"
        self.web_api = WebAPIConfigs(
            url=web_api_url,
            frequency=config['web_api_frequency'],
            verify_connection=config['web_api_verify_connection'],
            open_browser=config['web_api_open_browser'])
Example #20
File: process.py Project: snapbuy/labml
def run():
    pid = get_running_process()
    if pid:
        raise RuntimeError(
            f'This computer is already being monitored. PID: {pid}')

    from uuid import uuid1
    session_uuid = uuid1().hex
    with open(str(computer_singleton().config_folder / 'session.txt'),
              'w') as f:
        f.write(session_uuid)

    with open(str(computer_singleton().config_folder / 'monitor.pid'),
              'w') as f:
        f.write(str(os.getpid()))

    m = monitor.MonitorComputer(session_uuid)

    m.start({
        'os': monitor.get_os(),
        'cpu.logical': psutil.cpu_count(),
        'cpu.physical': psutil.cpu_count(logical=False)
    })

    while True:
        with monit.section('Track'):
            m.track()
        time.sleep(60)
Example #21
def save_bundle(path: Path,
                run_uuid: str,
                checkpoint: int = -1,
                *,
                data_files: List[str]):
    run_path = get_run_by_uuid(run_uuid)
    if run_path is None:
        raise RuntimeError(f"Couldn't find run {run_uuid}")

    checkpoint = _get_run_checkpoint(run_path, checkpoint)

    if checkpoint is None:
        raise RuntimeError(f"Couldn't find checkpoint {run_uuid}:{checkpoint}")

    info_path = path.parent / f'{path.stem}.info.json'
    info = {'uuid': run_uuid, 'checkpoint': checkpoint}
    with open(str(info_path), 'w') as f:
        f.write(json.dumps(info))

    checkpoint_path = run_path / "checkpoints" / str(checkpoint)

    with monit.section('Create bundle'):
        with tarfile.open(str(path), 'w:gz') as tar:
            tar.add(str(checkpoint_path), 'checkpoint')
            tar.add(str(run_path / 'run.yaml'), 'run.yaml')
            tar.add(str(run_path / 'configs.yaml'), 'configs.yaml')
            tar.add(str(info_path), 'info.json')
            for f in data_files:
                tar.add(str(lab.get_data_path() / f), f'data/{f}')

    info_path.unlink()
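
Together with `load_bundle` from Example #13, a round trip looks roughly like the following; the path, UUID, and data file are placeholders, not values from the original code:

    # Bundle a run on one machine...
    save_bundle(Path('run.tar.gz'), 'some-run-uuid', data_files=['vocab.json'])
    # ...and restore it on another.
    run_uuid, checkpoint = load_bundle(Path('run.tar.gz'))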
Example #22
def load_index(conf: Configs, n_probe: int = 8):
    """
    ## Load the index
    """
    # Dimensions of $f(c_i)$
    d_model = conf.transformer.d_model
    # Training data loader
    data_loader = conf.trainer.data_loader
    # Number of contexts; i.e. number of tokens in the training data minus one.
    # $\big(f(c_i), w_i\big)$ for $i \in [2, T]$
    n_keys = data_loader.data.shape[0] * data_loader.data.shape[1] - 1

    # Load FAISS index
    with monit.section('Load index'):
        index = faiss.read_index(str(lab.get_data_path() / 'faiss.index'))
    # Set number of cells to probe
    index.nprobe = n_probe

    # Load memory mapped numpy arrays
    keys_store = np.memmap(str(lab.get_data_path() / 'keys.npy'),
                           dtype=np.float32,
                           mode='r',
                           shape=(n_keys, d_model))
    vals_store = np.memmap(str(lab.get_data_path() / 'vals.npy'),
                           dtype=np.int64,
                           mode='r',
                           shape=(n_keys, 1))

    return index, keys_store, vals_store
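
Row $i$ of `keys_store` holds $f(c_i)$ and row $i$ of `vals_store` holds the token that followed that context, so FAISS ids map straight back to targets. A minimal retrieval sketch, with `queries` as a hypothetical `[n, d_model]` float32 array of context embeddings:

    index, keys_store, vals_store = load_index(conf)
    # 10 nearest stored contexts per query; returns (distances, ids)
    distances, neighbors = index.search(queries, 10)
    # Next-token values of the neighbours of the first query
    knn_targets = vals_store[neighbors[0], 0]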
Example #23
    def eval(self,
             command: str,
             *,
             log_dir: Optional[Path],
             is_silent=True) -> EvalResult:

        with monit.section(f'Exec: {command}', is_silent=is_silent):
            stdin, stdout, stderr = self.client.exec_command(command)
            out = stdout.read()
            err = stderr.read()
            exit_code = stdout.channel.recv_exit_status()
            if log_dir:
                with open(str(log_dir / 'stdout.log'), 'wb') as f:
                    f.write(out)
                with open(str(log_dir / 'stderr.log'), 'wb') as f:
                    f.write(err)
                with open(str(log_dir / 'exit_code'), 'w') as f:
                    f.write(str(exit_code))

            if exit_code != 0:
                monit.fail()

            return EvalResult(exit_code,
                              out.decode('utf-8').strip(),
                              err.decode('utf-8').strip())
Example #24
    def run(self):
        with monit.section("Initialize"):
            self.init()
        _ = self.validator
        _ = self.trainer
        for _ in self.training_loop:
            self.run_step()
Example #25
def section():
    arr = torch.zeros((1000, 1000))

    for i in range(N):
        with monit.section('run'):
            for t in range(10):
                arr += 1
Example #26
def run(is_check_process: bool = True, open_browser: bool = True):
    pid = get_running_process()
    if is_check_process and pid:
        raise RuntimeError(
            f'This computer is already being monitored. PID: {pid}')

    from uuid import uuid1
    session_uuid = uuid1().hex
    with open(str(computer_singleton().config_folder / 'session.txt'),
              'w') as f:
        f.write(session_uuid)

    with open(str(computer_singleton().config_folder / 'monitor.pid'),
              'w') as f:
        f.write(str(os.getpid()))

    m = monitor.MonitorComputer(session_uuid, open_browser)

    m.start()

    i = 0
    while True:
        with monit.section('Track', is_new_line=False):
            m.track()
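        # Back off gradually: poll every second at first, then up to once a minute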
        time.sleep(min(60.0, max(1.0, i / 5.0)))
        i += 1
Example #27
def download_repo(org: str, repo: str, idx: Optional[int]):
    zip_file = Path(lab.get_data_path() / 'download' / f'{org}_{repo}.zip')

    if zip_file.exists():
        return zip_file

    if idx is not None:
        idx_str = f"{idx:03}: "
    else:
        idx_str = ""

    with monit.section(f"{idx_str} {org}/{repo}") as s:
        try:
            zip = urllib.request.urlopen(
                f'https://github.com/{org}/{repo}/archive/master.zip')
        except urllib.error.HTTPError as e:
            print(e)
            return
        content = zip.read()

        size = len(content) // 1024
        s.message = f"{size :,}KB"

        with open(str(zip_file), 'wb') as f:
            f.write(content)

    return zip_file
Example #28
def main():
    predictor = get_predictor()

    with open(str(lab.get_data_path() / 'sample.py'), 'r') as f:
        sample = f.read()
    with monit.section('Evaluate'):
        evaluate(predictor, sample)
Example #29
def main(local_rank,
         rank,
         world_size,
         uuid,
         init_method: str = 'tcp://localhost:23456'):
    with monit.section('Distributed'):
        torch.distributed.init_process_group(
            "gloo",
            timeout=datetime.timedelta(seconds=30),
            init_method=init_method,
            rank=rank,
            world_size=world_size)
    conf = Configs()
    experiment.create(uuid=uuid, name='mnist ddp')
    experiment.distributed(local_rank, world_size)
    experiment.configs(
        conf, {
            'optimizer.optimizer': 'Adam',
            'optimizer.learning_rate': 1e-4,
            'model': 'ddp_model',
            'device.cuda_device': local_rank
        })
    conf.set_seed.set()
    experiment.add_pytorch_models(dict(model=conf.model))
    with experiment.start():
        conf.run()
Example #30
    def rsync(self, *, ui_mode: UIMode = UIMode.dots):
        with monit.section(f"RSync {self.conf.name}"):
            logger.log()
            exclude_path = Configs.get().exclude_file
            exclude_path = exclude_path.absolute()
            # z = compress
            # r = recursive
            # a = archive, equivalent to -rlptgoD (recursive, preserve everything)
            # v = verbose
            # u = update (skip what's newer on the receiver)
            # K = keep symlinks
            # L = transform symlinks into the referent dir/file
            # t = preserve modification times
            # l = copy symlinks
            # p = preserve permissions
            # g = preserve group
            # o = preserve owner
            # D = preserve device files
            rsync_cmd = ['rsync', '-zravuKLt', '--executability']
            if self.conf.private_key_file is not None:
                rsync_cmd += ['-e', f'"ssh -o StrictHostKeyChecking=no -i {self.conf.private_key_file}"']
            else:
                rsync_cmd += ['-e', f'"ssh -o StrictHostKeyChecking=no"']
            if exclude_path.exists():
                rsync_cmd += [f"--exclude-from='{str(exclude_path)}'"]
            rsync_cmd += ['./']  # source
            rsync_cmd += [f'{self.conf.username}@{self.conf.hostname}:~/{self.project_name}/']  # destination

            log_dir = self._get_log_folder(f'rsync_{self.conf.name}')
            exit_code = self.local_exec.stream(' '.join(rsync_cmd),
                                               log_dir=log_dir,
                                               ui_mode=ui_mode)

            if exit_code != 0:
                raise RemoteError("Failed to run rsync")
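
For illustration only (username, host, project name, and exclude path are hypothetical), the list above joins into a single shell command along these lines:

    # ' '.join(rsync_cmd) produces something like:
    #   rsync -zravuKLt --executability -e "ssh -o StrictHostKeyChecking=no"
    #       --exclude-from='/home/me/proj/.exclude' ./ ubuntu@my-server:~/my_project/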