예제 #1
0
    def valid(self):
        """
        Evaluate the model on the validation set.

        For every batch this tracks the per-sample loss and the per-sample
        prediction, each paired with a running sample index. After the loop it
        tracks the mean loss and the accuracy over `self.valid_dataset`.
        """
        self.model.eval()
        total_loss = 0
        n_correct = 0
        # Running index of the first sample in the current batch
        offset = 0
        with torch.no_grad():
            for data, target in monit.iterate("Test", self.valid_loader):
                data = data.to(self.device)
                target = target.to(self.device)
                output = self.model(data)

                # Un-reduced loss: one value per sample
                sample_losses = F.nll_loss(output, target, reduction='none').cpu().numpy()
                indexes = list(range(offset, offset + len(sample_losses)))
                tracker.add('valid.sample_loss', (indexes, list(sample_losses)))
                total_loss += float(np.sum(sample_losses))

                # Highest-scoring class for each sample
                pred = output.argmax(dim=1, keepdim=True)
                sample_preds = list(pred.cpu().numpy())
                tracker.add('valid.sample_pred', (indexes, sample_preds))
                n_correct += pred.eq(target.view_as(pred)).sum().item()

                offset += len(sample_preds)

        # Add test loss and accuracy to logger
        n_samples = len(self.valid_dataset)
        tracker.add({'valid.loss': total_loss / n_samples})
        tracker.add({'valid.accuracy': n_correct / n_samples})
예제 #2
0
def to_daily_packets(df: pd.DataFrame):
    """
    Group per-minute candles into one fixed-size packet per day.

    Reads the `Volume`, `Minute`, `High`, `Low`, `Open`, `Close` and `Date`
    columns of `df`. Consecutive rows that share a `Date` value form one day.
    Each day gets a `(390, 5)` array indexed by minute; rows whose minute is
    outside `[0, 390)` are ignored.

    Returns a pair of arrays: the dates, and the packets in the same order.
    """
    volume = np.array(df['Volume'], dtype=float)
    minutes = np.array(df["Minute"])
    n_rows = len(df)

    # Columns `0:5` are what gets copied into a packet; column 5 keeps the minute
    candles = np.zeros((n_rows, 6), dtype=float)
    price_columns = (
        (CandleIdx.high, 'High'),
        (CandleIdx.low, 'Low'),
        (CandleIdx.open, 'Open'),
        (CandleIdx.close, 'Close'),
    )
    for column, name in price_columns:
        candles[:, column] = np.array(df[name])
    candles[:, CandleIdx.volume] = volume
    candles[:, 5] = minutes

    date_column = df['Date']
    dates, packets = [], []
    current_day, packet = None, None
    for row in monit.iterate("To daily packets", n_rows):
        day = date_column[row]
        if day != current_day:
            # A new day starts: store the finished packet, if there is one
            if current_day is not None:
                dates.append(current_day)
                packets.append(packet)
            current_day = day
            packet = np.zeros((390, 5), dtype=float)
        minute = minutes[row]
        if 0 <= minute < 390:
            packet[minute, :] = candles[row, 0:5]

    # Store the packet of the last day
    if current_day is not None:
        dates.append(current_day)
        packets.append(packet)

    return np.array(dates), np.array(packets)
예제 #3
0
    def sample(self):
        """
        ### Sampling function to generate samples periodically while training

        Greedily samples 25 tokens starting from `self.prompt`. At each step
        only the last token of the text is fed to the model, together with the
        memory accumulated so far. The prompt and the sampled tokens are logged.
        """

        # Text generated so far, starting from the configured prompt
        text = self.prompt
        # Pieces to print; the prompt is shown in the subtle style
        pieces = [(text, Text.subtle)]
        # Memory carried from step to step
        memory = []
        # Sample 25 tokens
        for _ in monit.iterate('Sample', 25):
            # Tokenize the text and keep only the last token
            tokens = self.text.text_to_i(text).unsqueeze(-1)[-1:]
            tokens = tokens.to(self.device)
            # Run the model with the current memory
            logits, new_memory = self.model(tokens, memory)
            # Greedy prediction
            predicted = logits.argmax(dim=-1).squeeze(1)
            # Text contributed by the newly predicted token
            addition = self.prompt_separator + self.text.itos[predicted[-1]]
            text += addition
            pieces.append((addition, Text.value))
            # Fold the new memory into the running memory
            memory = self.merge_memory(memory, new_memory)

        # Print the sampled output
        logger.log(pieces)
예제 #4
0
파일: rnn.py 프로젝트: whiskey1/samples
    def _test(self):
        """
        Evaluate the encoder on `self.test_loader`.

        For each batch this computes the loss and the macro-averaged F1 score.
        The means over all batches are saved to the tracker as `test_loss` and
        `accuracy` (note that `accuracy` holds the mean macro F1 score).
        """
        self.encoder.eval()

        with torch.no_grad():
            macro_f1s = []
            test_losses = []
            for input_tensor, target_tensor in monit.iterate(
                    "test", self.test_loader):
                # Fresh hidden state for every batch
                encoder_hidden = self.encoder.init_hidden(
                    self.device).double().to(self.device)

                input_tensor = input_tensor.to(self.device).unsqueeze(1)
                target_tensor = target_tensor.to(self.device).double()

                encoder_output, encoder_hidden = self.encoder(
                    input_tensor, encoder_hidden)

                test_loss = self.loss(encoder_output, target_tensor)

                # The outputs are truncated to integers before scoring
                macro_f1 = f1_score(
                    y_true=target_tensor.cpu().detach().numpy().ravel(),
                    y_pred=encoder_output.cpu().detach().to(
                        torch.int32).numpy().ravel(),
                    average='macro')

                # Store a Python float: `np.mean` cannot convert a list of
                # tensors that live on a CUDA device.
                # Assumes `self.loss` returns a single-element tensor - TODO confirm
                test_losses.append(test_loss.item())
                macro_f1s.append(macro_f1)

            tracker.save(test_loss=np.mean(test_losses),
                         accuracy=np.mean(macro_f1s))
예제 #5
0
    def sample(self):
        """
        ### Sampling function to generate samples periodically while training

        Greedily extends the fixed prompt `'It is'` by 25 tokens. The model is
        re-run on the whole text at every step, and the result is logged.
        """

        # Text generated so far, starting from the fixed prompt
        text = 'It is'
        # Pieces to print; the prompt is shown in the subtle style
        pieces = [(text, Text.subtle)]
        # Sample 25 tokens
        for _ in monit.iterate('Sample', 25):
            # Tokenize the whole text
            tokens = self.dataset.text_to_i(text).unsqueeze(-1)
            tokens = tokens.to(self.device)
            # Run the model
            logits = self.model(tokens)
            # Greedy prediction
            predicted = logits.argmax(dim=-1).squeeze()
            # The token predicted after the last position
            next_token = self.dataset.itos[predicted[-1].item()]
            text += next_token
            pieces.append((next_token, Text.value))

        # Print the sampled output
        logger.log(pieces)
예제 #6
0
    def encode(self, data: str, *, is_silent: bool = True):
        """
        Encode text by splitting it into words and BPE-encoding each word.

        * `data` is the text to encode
        * `is_silent` is passed on to the word tokenizer and the progress monitor

        Returns the per-word encodings concatenated into a single list.
        """
        words = self.word_tokenizer.tokenize(data, is_silent=is_silent)

        encoded = []
        for word in monit.iterate('Encode words', words, is_silent=is_silent):
            encoded.extend(self.bpe.encode(word))

        return encoded
예제 #7
0
def build_index(conf: Configs,
                n_centeroids: int = 2048,
                code_size: int = 64,
                n_probe: int = 8,
                n_train: int = 200_000):
    """
    ## Build FAISS index

    [Getting started](https://github.com/facebookresearch/faiss/wiki/Getting-started),
    [faster search](https://github.com/facebookresearch/faiss/wiki/Faster-search),
    and [lower memory footprint](https://github.com/facebookresearch/faiss/wiki/Lower-memory-footprint)
    tutorials on FAISS will help you learn more about FAISS usage.

    * `conf` provides the key dimension (`conf.transformer.d_model`)
      and the training data loader (`conf.trainer.data_loader`)
    * `n_centeroids`, `code_size` and the constant `8` are passed to
      `faiss.IndexIVFPQ` after the quantizer and the dimension
    * `n_probe` is assigned to `index.nprobe`
    * `n_train` is the maximum number of keys used to train the index

    Keys are read from `keys.npy` and the index is written to `faiss.index`,
    both under the lab data path.
    """
    # Dimensions of $f(c_i)$
    d_model = conf.transformer.d_model
    # Training data
    data = conf.trainer.data_loader.data
    # Number of contexts; i.e. number of tokens in the training data minus one.
    n_keys = data.shape[0] * data.shape[1] - 1

    # Build an index with Voronoi cell based faster search with compression that
    # doesn't store full vectors.
    index = faiss.IndexIVFPQ(faiss.IndexFlatL2(d_model), d_model, n_centeroids, code_size, 8)
    index.nprobe = n_probe

    # Load the memory mapped numpy array of keys
    keys_path = str(lab.get_data_path() / 'keys.npy')
    keys_store = np.memmap(keys_path, dtype=np.float32, mode='r', shape=(n_keys, d_model))

    # Pick a random sample of keys (without repetition) to train the index with
    train_size = min(n_train, n_keys)
    random_sample = np.random.choice(np.arange(n_keys), size=[train_size], replace=False)

    with monit.section('Train index'):
        # Train the index to store the keys
        index.train(keys_store[random_sample])

    # Add keys to the index in batches; the id of each key is its row number
    batch_size = 1024
    for start in monit.iterate('Index', range(0, n_keys, batch_size)):
        end = min(start + batch_size, n_keys)
        index.add_with_ids(keys_store[start:end], np.arange(start, end))

    with monit.section('Save'):
        # Save the index
        faiss.write_index(index, str(lab.get_data_path() / 'faiss.index'))
예제 #8
0
    def sample(self):
        """
        Greedily sample 25 tokens after the prompt `'def train('` and log them.

        The model is re-run on the whole text at every step; only the first
        value it returns is used.
        """
        text = 'def train('
        pieces = [(text, Text.subtle)]
        for _ in monit.iterate('Sample', 25):
            # Tokenize the whole text
            tokens = self.text.text_to_i(text).unsqueeze(-1)
            tokens = tokens.to(self.device)
            # Ignore everything the model returns besides the first value
            logits, *_rest = self.model(tokens)
            # Greedy prediction
            predicted = logits.argmax(dim=-1).squeeze()
            next_token = '' + self.text.itos[predicted[-1]]
            text += next_token
            pieces.append((next_token, Text.value))

        logger.log(pieces)
예제 #9
0
    def interpolate_animate(self,
                            x1: torch.Tensor,
                            x2: torch.Tensor,
                            n_frames: int = 100,
                            t_: int = 100,
                            create_video=True):
        """
        #### Interpolate two images $x_0$ and $x'_0$ and make a video

        * `x1` is $x_0$
        * `x2` is $x'_0$
        * `n_frames` is the number of interpolation intervals;
          `n_frames + 1` frames are produced, from $\lambda = 0$ to $\lambda = 1$
        * `t_` is $t$
        * `create_video` specifies whether to make a video or to show each frame
        """

        # Show original images
        self.show_image(x1, "x1")
        self.show_image(x2, "x2")
        # Add batch dimension
        x1_batch = x1[None, :, :, :]
        x2_batch = x2[None, :, :, :]
        # $t$ tensor
        t = torch.full((1, ), t_, device=self.device)
        # $x_t \sim q(x_t|x_0)$
        x1t = self.diffusion.q_sample(x1_batch, t)
        # $x'_t \sim q(x'_t|x'_0)$
        x2t = self.diffusion.q_sample(x2_batch, t)

        frames = []
        # Get frames with different $\lambda$
        for step in monit.iterate('Interpolate',
                                  n_frames + 1,
                                  is_children_silent=True):
            # $\lambda$
            lambda_ = step / n_frames
            # $$\bar{x}_t = (1 - \lambda)x_t + \lambda x'_t$$
            mixed = (1 - lambda_) * x1t + lambda_ * x2t
            # $$\bar{x}_0 \sim \textcolor{cyan}{p_\theta}(x_0|\bar{x}_t)$$
            frame = self._sample_x0(mixed, t_)[0]
            # Add to frames
            frames.append(frame)
            # Show the frame immediately when no video is requested
            if not create_video:
                self.show_image(frame, f"{lambda_ :.2f}")

        # Make video
        if create_video:
            self.make_video(frames)
예제 #10
0
    def _test(self):
        """
        Evaluate the model on `self.test_loader`.

        Accumulates the summed negative log-likelihood loss and the number of
        correct predictions over all batches, then tracks the per-sample loss
        and the accuracy as `valid.loss` and `valid.accuracy`.
        """
        self.model.eval()
        loss_sum = 0
        n_correct = 0
        with torch.no_grad():
            for data, target in monit.iterate("Test", self.test_loader):
                data = data.to(self.device)
                target = target.to(self.device)
                output = self.model(data)
                loss_sum += F.nll_loss(output, target, reduction='sum').item()
                # Highest-scoring class for each sample
                pred = output.argmax(dim=1, keepdim=True)
                n_correct += pred.eq(target.view_as(pred)).sum().item()

        # Add test loss and accuracy to logger
        n_samples = len(self.test_loader.dataset)
        tracker.add({'valid.loss': loss_sum / n_samples})
        tracker.add({'valid.accuracy': n_correct / n_samples})
예제 #11
0
def parse(df: pd.DataFrame):
    """
    Add a `Minute` column and rewrite the `Date` column, modifying `df` in place.

    * `Time` strings are read as two-digit hour at `[0:2]` and two-digit minute
      at `[3:5]`; `Minute` is `hour * 60 + minute - 570`, i.e. minutes since 09:30.
    * `Date` strings are read as month at `[0:2]`, day at `[3:5]` and year at
      `[6:10]`, and rewritten as `year-month-day`.

    Returns the same data frame.
    """
    n_rows = len(df)
    minutes = np.zeros(n_rows, dtype=int)
    dates = []
    time_column = df['Time']
    date_column = df['Date']
    for row in monit.iterate("Calculate time", n_rows):
        clock = time_column[row]
        minutes[row] = int(clock[0:2]) * 60 + int(clock[3:5])
        raw_date = date_column[row]
        dates.append(f"{raw_date[6:10]}-{raw_date[0:2]}-{raw_date[3:5]}")

    # Shift so that 09:30 becomes minute zero
    df['Minute'] = minutes - 570
    df['Date'] = dates

    return df
예제 #12
0
def extract_tar(tar_file: Path, to_path: Path):
    """
    Extract a ``.tar.gz`` file.

    Regular files are extracted to ``to_path / <member name>``, directory
    members are skipped, and any other member type is logged as a warning.

    Arguments:
        tar_file (Path): ``.tar.gz`` file
        to_path (Path): location to extract the contents
    """
    with tarfile.open(str(tar_file), 'r:gz') as tar:
        for member in monit.iterate('Extract tar', tar.getmembers()):
            # Directory entries need no action here
            if member.isdir():
                continue
            if member.isfile():
                _extract_tar_file(tar, member, to_path / member.name)
                continue
            logger.log(f'Unknown file type {member.name}', Text.warning)
예제 #13
0
파일: evaluate.py 프로젝트: weihaoxie/nn
    def _sample_x0(self, xt: torch.Tensor, n_steps: int):
        """
        #### Sample an image using $\textcolor{lightgreen}{p_\theta}(x_{t-1}|x_t)$

        * `xt` is $x_t$
        * `n_steps` is $t$

        Applies `self.diffusion.p_sample` for time steps `n_steps - 1` down to
        `0` and returns the result.
        """

        # Number of samples
        batch_size = xt.shape[0]
        # Iterate until $t$ steps
        for step in monit.iterate('Denoise', n_steps):
            # Time step, counting down, repeated for every sample
            t = xt.new_full((batch_size,), n_steps - step - 1, dtype=torch.long)
            # Sample from $\textcolor{lightgreen}{p_\theta}(x_{t-1}|x_t)$
            xt = self.diffusion.p_sample(xt, t)

        # Return $x_0$
        return xt
예제 #14
0
    def sample(self):
        """
        Greedily sample 25 tokens after the prompt `'def train('` and log them.

        The model is called with the current prompt and the running state.
        When `self.is_token_by_token` is set, only the newly predicted token is
        fed to the model on the next step; otherwise the predicted token is
        appended to the prompt and the whole prompt is fed again.
        """
        prompt = 'def train('
        log = [(prompt, Text.subtle)]
        state = None
        for i in monit.iterate('Sample', 25):
            data = self.text.text_to_i(prompt).unsqueeze(-1)
            data = data.to(self.device)
            output, new_state = self.model(data, state)
            # Greedy prediction
            output = output.argmax(dim=-1).squeeze(1)
            token = self.text.tokenizer.itos[output[-1]]
            if self.is_token_by_token:
                prompt = token
            else:
                # Append exactly once; the token used to be appended both
                # before this branch and inside it, duplicating every token.
                prompt += token
            log += [(token, Text.value)]
            state = self.state_updater(state, new_state)

        logger.log(log)
예제 #15
0
def validate(model, valid_loader, device):
    """
    Evaluate `model` on `valid_loader` and save the results to the tracker.

    * `model` is the model to evaluate; it is switched to evaluation mode
    * `valid_loader` is the data loader; its `dataset` length is the sample count
    * `device` is the device the batches are moved to

    Saves the mean cross-entropy loss as `loss.valid` and the accuracy in
    percent as `accuracy.valid`.
    """
    model.eval()
    loss_sum = 0
    n_correct = 0
    with torch.no_grad():
        for data, target in monit.iterate("valid", valid_loader):
            data = data.to(device)
            target = target.to(device)

            output = model(data)
            loss_sum += F.cross_entropy(output, target, reduction='sum').item()
            # Highest-scoring class for each sample
            pred = output.argmax(dim=1, keepdim=True)
            n_correct += pred.eq(target.view_as(pred)).sum().item()

    n_samples = len(valid_loader.dataset)
    valid_loss = loss_sum / n_samples
    valid_accuracy = 100. * n_correct / n_samples

    # **Save stats**
    tracker.save({'loss.valid': valid_loss, 'accuracy.valid': valid_accuracy})
예제 #16
0
    def sample_animation(self,
                         n_frames: int = 1000,
                         create_video: bool = True):
        """
        #### Sample an image step-by-step using $\textcolor{cyan}{p_\theta}(x_{t-1}|x_t)$

        We sample an image step-by-step using $\textcolor{cyan}{p_\theta}(x_{t-1}|x_t)$ and at each step
        show the estimate
        $$x_0 \approx \hat{x}_0 = \frac{1}{\sqrt{\bar\alpha_t}}
         \Big( x_t - \sqrt{1 - \bar\alpha_t} \textcolor{cyan}{\epsilon_\theta}(x_t, t) \Big)$$

        * `n_frames` is the approximate number of frames to collect; a frame is
          taken every `self.n_steps // n_frames` steps (at least every step)
        * `create_video` specifies whether to make a video or to show each frame
        """

        # $x_T \sim p(x_T) = \mathcal{N}(x_T; \mathbf{0}, \mathbf{I})$
        xt = torch.randn(
            [1, self.image_channels, self.image_size, self.image_size],
            device=self.device)

        # Interval to log $\hat{x}_0$.
        # Clamp to 1 so that `n_frames > self.n_steps` does not produce a zero
        # interval and a division by zero in the modulo below.
        interval = max(1, self.n_steps // n_frames)
        # Frames for video
        frames = []
        # Sample $T$ steps
        for t_inv in monit.iterate('Denoise', self.n_steps):
            # $t$
            t_ = self.n_steps - t_inv - 1
            # $t$ in a tensor
            t = xt.new_full((1, ), t_, dtype=torch.long)
            # $\textcolor{cyan}{\epsilon_\theta}(x_t, t)$
            eps_theta = self.eps_model(xt, t)
            if t_ % interval == 0:
                # Get $\hat{x}_0$ and add to frames
                x0 = self.p_x0(xt, t, eps_theta)
                frames.append(x0[0])
                if not create_video:
                    self.show_image(x0[0], f"{t_}")
            # Sample from $\textcolor{cyan}{p_\theta}(x_{t-1}|x_t)$
            xt = self.p_sample(xt, t, eps_theta)

        # Make video
        if create_video:
            self.make_video(frames)
예제 #17
0
    def iterate(self):
        """
        ### Iteratively update $\textcolor{lightgreen}{\sigma^t(I)(a)}$

        This updates the strategies for $T$ iterations.

        Each iteration walks the tree once per player from a new history,
        then records analytics. A checkpoint is saved after every 1,000
        iterations.
        """

        # Loop for `epochs` times
        for epoch in monit.iterate('Train', self.epochs):
            # Walk tree and update regrets for each player
            for player in range(self.n_players):
                history = self.create_new_history()
                self.walk_tree(history, cast(Player, player), 1, 1)

            # Track data for analytics
            tracker.add_global_step()
            self.tracker(self.info_sets)
            tracker.save()

            # Save checkpoints every $1,000$ iterations
            completed = epoch + 1
            if completed % 1_000 == 0:
                experiment.save_checkpoint()
예제 #18
0
    def run(self):
        """
        Run the training loop.

        On every iteration of `self.training_loop` this greedily samples 25
        tokens after the prompt `'def train('` and logs them, runs
        `self.trainer` in training mode inside the `train` tracker namespace,
        and then runs `self.validator` inside the `valid` namespace.
        """
        for _ in self.training_loop:
            # Sample from the model before training
            text = 'def train('
            pieces = [(text, Text.subtle)]
            for _step in monit.iterate('Sample', 25):
                tokens = self.text.text_to_i(text).unsqueeze(-1)
                tokens = tokens.to(self.device)
                # Ignore everything the model returns besides the first value
                logits, *_rest = self.model(tokens)
                predicted = logits.argmax(dim=-1).squeeze()
                next_token = '' + self.text.itos[predicted[-1]]
                text += next_token
                pieces.append((next_token, Text.value))

            logger.log(pieces)

            # Train
            with Mode(is_train=True,
                      is_log_parameters=self.is_log_parameters,
                      is_log_activations=self.is_log_activations), \
                    tracker.namespace('train'):
                self.trainer()
            # Validate
            with tracker.namespace('valid'):
                self.validator()
예제 #19
0
    def sample(self):
        """
        Sampling function to generate samples periodically while training

        Greedily extends `self.prompt` by 25 tokens, joining tokens with
        `self.prompt_separator`, and logs the result. The model is re-run on
        the whole text at every step.
        """
        text = self.prompt
        pieces = [(text, Text.subtle)]
        # Sample 25 tokens
        for _ in monit.iterate('Sample', 25):
            # Tokenize the whole text
            tokens = self.text.text_to_i(text).unsqueeze(-1)
            tokens = tokens.to(self.device)
            # Run the model
            logits = self.model(tokens)
            # Greedy prediction
            predicted = logits.argmax(dim=-1).squeeze()
            # Separator plus the token predicted after the last position
            addition = self.prompt_separator + self.text.itos[predicted[-1]]
            text += addition
            pieces.append((addition, Text.value))

        logger.log(pieces)
예제 #20
0
    def train(self):
        """
        ### Train

        Makes one pass over `self.data_loader`, taking one optimizer step per
        batch and saving the loss of each batch to the tracker.
        """

        for batch in monit.iterate('Train', self.data_loader):
            # Every batch counts as one global step
            tracker.add_global_step()
            batch = batch.to(self.device)

            # Clear gradients left over from the previous step
            self.optimizer.zero_grad()
            # Forward and backward pass
            loss = self.diffusion.loss(batch)
            loss.backward()
            # Update the parameters
            self.optimizer.step()
            # Track the loss
            tracker.save('loss', loss)
예제 #21
0
    def sample(self):
        """
        ### Sample images

        Starts from Gaussian noise for `self.n_samples` images and applies
        `self.diffusion.p_sample` for time steps `self.n_steps - 1` down to `0`.
        The final images are saved to the tracker as `sample`.
        """
        with torch.no_grad():
            # $x_T \sim p(x_T) = \mathcal{N}(x_T; \mathbf{0}, \mathbf{I})$
            shape = [self.n_samples, self.image_channels, self.image_size, self.image_size]
            x = torch.randn(shape, device=self.device)

            # Remove noise for $T$ steps
            for step in monit.iterate('Sample', self.n_steps):
                # $t$, counting down, repeated for every sample
                t = x.new_full((self.n_samples, ), self.n_steps - step - 1, dtype=torch.long)
                # Sample from $\color{cyan}{p_\theta}(x_{t-1}|x_t)$
                x = self.diffusion.p_sample(x, t)

            # Log samples
            tracker.save('sample', x)
예제 #22
0
    def __init__(self,
                 path: PurePath,
                 tokenizer: Callable,
                 train: str,
                 valid: str,
                 test: str,
                 *,
                 n_tokens: Optional[int] = None,
                 stoi: Optional[Dict[str, int]] = None,
                 itos: Optional[List[str]] = None):
        """
        Store the dataset splits and set up the vocabulary.

        * `path` is stored as `self.path`
        * `tokenizer` is a callable that splits text into tokens
        * `train`, `valid` and `test` are the texts of the three splits
        * `n_tokens`, `stoi` and `itos` are an existing vocabulary; if any of
          them is given, all three must be given

        Without an existing vocabulary, one is built from `self.standard_tokens`
        followed by the sorted unique tokens of the training and validation
        texts.
        """
        self.path = path
        self.tokenizer = tokenizer
        self.train = train
        self.valid = valid
        self.test = test

        # Use the supplied vocabulary as is
        if n_tokens or stoi or itos:
            assert stoi and itos and n_tokens
            self.n_tokens = n_tokens
            self.stoi = stoi
            self.itos = itos
            return

        # Start from the standard tokens
        self.n_tokens = len(self.standard_tokens)
        self.stoi = {token: idx for idx, token in enumerate(self.standard_tokens)}

        with monit.section("Tokenize"):
            all_tokens = self.tokenizer(self.train) + self.tokenizer(self.valid)
            vocabulary = sorted(set(all_tokens))

        # Assign the next free index to each token
        for token in monit.iterate("Build vocabulary", vocabulary):
            self.stoi[token] = self.n_tokens
            self.n_tokens += 1

        # Invert the mapping
        self.itos = [''] * self.n_tokens
        for token, idx in self.stoi.items():
            self.itos[idx] = token
예제 #23
0
 def learn(self, merges: int):
     """
     Run `merges` rounds of `self.merge_pair`.

     In each round `merge_pair` is called repeatedly until it returns
     something other than `None`.
     """
     for _ in monit.iterate('BPE', merges):
         # Retry until a call reports a result
         while self.merge_pair() is None:
             pass
예제 #24
0
    def sample(self):
        """
        ### Evaluation

        We use the sampling function to evaluate the model on a set of problems.

        This generates `self.n_tests` problems and feeds the question part of
        each one to the model token by token. The remaining tokens are sampled
        greedily, up to `self.seq_len` tokens in total. The first sampled
        sequence is logged, and the fraction of answers that match exactly is
        saved to the tracker as `score`.
        """

        # Skip in the first epoch
        if self.training_loop.idx < 1:
            return

        # Create a dataset to generate problems
        dataset = ArithmeticDataset(self.seq_len, self.max_digits, 1)
        # Get a set of problems and answers; each item is indexed as
        # `[0]` for the question and `[1]` for the answer
        qa = [dataset.get_qa() for _ in range(self.n_tests)]
        # Collect the problems only
        questions = [p[0] for p in qa]

        # Create a tensor with only the initial token of each question;
        # it has a single row with one column per question
        data = torch.tensor([[dataset.stoi[p[0]] for p in questions]])
        # Move to device
        data = data.to(self.device)

        # Boolean mask, one entry per question, marking the sequences that have completed
        finished = torch.zeros((len(questions), )).bool().to(self.device)
        # Token id of the new line character - this marks end of the answer
        new_line = dataset.stoi['\n']

        # Sampled results; each starts with the first character of its question
        results = [p[0] for p in questions]

        # Sample upto sequence length
        for i in monit.iterate('Sample', self.seq_len - 1):
            # If all the sequences have completed we skip this
            # (`continue` rather than `break`, so the iterator still runs to its end)
            if finished.sum() == len(finished):
                continue

            # Get the model output; only the first returned value is used
            output, *_ = self.model(data)
            # Get the model prediction (greedy) for the last position
            output = output[-1].argmax(dim=-1)

            # Find which sequences have finished
            # NOTE(review): this uses the raw prediction, before the question
            # override below - confirm that a new line predicted inside the
            # question should count as finished
            finished = finished | (output == new_line)
            # Skip if all have finished
            if finished.sum() == len(finished):
                continue

            # Override with the question: while still inside the question,
            # the next token is taken from the question, not the prediction
            for j, p in enumerate(questions):
                if len(p) > i + 1:
                    output[j] = dataset.stoi[p[i + 1]]

            # Add the next token to the input
            data = torch.cat([data, output[None, :]], dim=0)

            # Get the sampled results
            for j, c in enumerate(output):
                results[j] += dataset.itos[c]

        # Discard everything after the answer in the results
        results = [r.split('\n')[0] for r in results]

        # Log a sample: the part before the first `;` as key, the rest as plain text
        res_sample = results[0].split(';')
        logger.log([(res_sample[0], Text.key), (';', Text.subtle),
                    (';'.join(res_sample[1:]), Text.none)])

        # Get the answers: the text after the last `x==`
        results = [r.split('x==')[-1] for r in results]

        # Count the number of correct answers
        correct = 0
        for r, _qa in zip(results, qa):
            if r == _qa[1]:
                correct += 1

        # Log the score
        tracker.save('score', correct / len(results))
예제 #25
0
def build_dataset(chunk_len: int = 16, chunks_per_sample: int = 32, skip_range: int = 8):
    """
    ## Build the dataset

    * `chunk_len` is the chunk length
    * `chunks_per_sample` is the number of chunks per training sample
    * `skip_range` is the maximum number of characters to skip between two samples.
        We skip a few characters between samples to make sure the samples
        aren't aligned perfectly with the chunks in the [database](database.html)

    Each sample is a tuple of the input text, the target text (the input
    shifted by one character) and the neighbor texts of every chunk. The
    samples are written to `retro_train_dataset.json` in the lab data path.
    """

    # Load the text file
    dataset = TextFileDataset(
        lab.get_data_path() / 'tiny_shakespeare.txt',
        list,
        url='https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt')

    # Training portion of it
    text = dataset.train

    # Load the index for retrieving neighbors
    index = RetroIndex()

    # Number of input characters in a sample
    sample_len = chunks_per_sample * chunk_len

    # The input sample offsets
    sample_offsets = []
    # Cursor for the text
    i = 0
    while i < len(text):
        # Skip a few characters to make sure it's not aligned with the neighbors
        skip = np.random.randint(skip_range)
        i += skip

        # Stop if we've reached the end of the text.
        # A sample reads `sample_len + 1` characters (one extra for the
        # prediction target), so all of them must fit; otherwise the last
        # sample would get a truncated input and target.
        if i + sample_len + 1 > len(text):
            break

        # Collect the offset
        sample_offsets.append(i)

        # Increment the cursor
        i += sample_len

    # For samples
    samples = []
    # Iterate through sample offsets
    for i in monit.iterate('Gather Neighbors', sample_offsets):
        # Get the sample including an extra character (for prediction)
        sample = text[i: i + sample_len + 1]
        # The input
        src = sample[:-1]
        # Break it into chunks
        chunks = [src[j:j + chunk_len] for j in range(0, len(src), chunk_len)]
        # The chunk offsets
        chunk_offsets = [j + i for j in range(0, len(src), chunk_len)]

        # Retrieve nearest neighbors
        neighbor_offsets = index(chunks, chunk_offsets)

        # Get neighbor texts. The neighbor length is twice the `chunk_len`
        neighbors = [[text[j: j + chunk_len * 2] for j in n_off] for n_off in neighbor_offsets]

        # Add to list of samples
        samples.append((sample[:-1], sample[1:], neighbors))

    # Save the samples in JSON.
    # We don't need to use complex dataset storage mechanisms or pre-tokenize
    # since our dataset is small.
    with open(str(lab.get_data_path() / 'retro_train_dataset.json'), 'w') as f:
        f.write(json.dumps(samples))
예제 #26
0
def build_database(chunk_len: int = 16, batch_size: int = 64, d_emb: int = 768, n_centeroids: int = 256,
                   code_size: int = 64, n_probe: int = 8, n_train: int = 50_000):
    """
    ## Build Database

    * `chunk_len` is the length of a chunk (number of characters)
    * `batch_size` is the batch size to use when calculating $\text{B\small{ERT}}(N)$
    * `d_emb` is the number of features in $\text{B\small{ERT}}(N)$ embeddings
    * `n_centeroids` is the number of
        [lists to select in FAISS index](https://faiss.ai/cpp_api/struct/structfaiss_1_1IndexIVFPQ.html)
    * `code_size` encoded vector size in the index
    * `n_probe` is the number of lists to probe
    * `n_train` is the number of keys to train the index on

    The index is written to `retro.index` in the lab data path.
    """

    # Load the dataset text file
    dataset = TextFileDataset(
        lab.get_data_path() / 'tiny_shakespeare.txt',
        list,
        url='https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt')

    # Get training data (a string)
    text = dataset.train

    # Offsets of the chunks; an offset is kept only if two chunk lengths
    # starting there still end before the end of the text
    offsets = [i for i in range(0, len(text), chunk_len) if i + chunk_len * 2 < len(text)]
    # Split the text into chunks of `chunk_len`
    chunks = [text[i:i + chunk_len] for i in offsets]
    # The offsets as an array; they are used as ids in the index
    chunk_offsets = np.array(offsets)
    # Number of chunks
    n_chunks = len(chunks)

    # Initialize BERT to get $\text{B\small{ERT}}(N)$
    bert = BERTChunkEmbeddings(torch.device('cuda:0'))

    # Get chunk embeddings by processing `batch_size` number of chunks on each iteration
    embedding_batches = []
    for start in monit.iterate('Get embeddings', range(0, n_chunks, batch_size)):
        batch = chunks[start: start + batch_size]
        embedding_batches.append(bert(batch).cpu())
    # Merge them into a single array
    chunk_emb = torch.cat(embedding_batches, dim=0).numpy()

    # Create the [FAISS index](https://faiss.ai/cpp_api/struct/structfaiss_1_1IndexIVFPQ.html)
    index = faiss.IndexIVFPQ(faiss.IndexFlatL2(d_emb), d_emb, n_centeroids, code_size, 8)
    index.nprobe = n_probe

    # Get a random sample of the chunk indexes (without repetition)
    train_size = min(n_train, n_chunks)
    random_sample = np.random.choice(np.arange(n_chunks), size=[train_size], replace=False)

    # Train the index to store the keys
    with monit.section('Train index'):
        index.train(chunk_emb[random_sample])

    # Add the chunks to the index in batches of size `1024`
    index_batch = 1024
    for start in monit.iterate('Index', range(0, n_chunks, index_batch)):
        end = min(start + index_batch, n_chunks)
        # Add to index
        index.add_with_ids(chunk_emb[start:end], chunk_offsets[start:end])

    # Save the index
    with monit.section('Save'):
        faiss.write_index(index, str(lab.get_data_path() / 'retro.index'))
예제 #27
0
from labml import monit
from labml_app.db import computer, init_db

# Migration script: for every stored computer whose `sessions` field holds a
# list, replace it with an empty set and save the record.
res = {}

init_db()
computer_keys = computer.Computer.get_all()
for computer_key in monit.iterate('computers', computer_keys):
    c: computer.Computer = computer_key.load()

    # `isinstance` is the idiomatic type check and also matches list subclasses
    if isinstance(c.sessions, list):
        c.sessions = set()
        c.save()

# NOTE(review): `res` is never populated, so this always prints `{}`
print(res)
예제 #28
0
def fill_empty_minutes_in_packets(packets: np.ndarray):
    """
    Call `fill_empty_minutes_in_packet` on every packet along the first axis.

    * `packets` is an array whose first axis enumerates the packets
    """
    n_packets = packets.shape[0]
    for packet_idx in monit.iterate("Fill empty minutes", n_packets):
        packet = packets[packet_idx]
        fill_empty_minutes_in_packet(packet)