def valid(self):
    """
    ### Evaluate the model on the validation set

    Tracks the per-sample loss and per-sample prediction, each paired with a
    running sample index, and finally the summed loss and the number of correct
    predictions divided by `len(self.valid_dataset)`.
    """
    self.model.eval()
    test_loss = 0
    correct = 0
    # Index of the first sample of the current batch within the whole pass
    idx = 0
    with torch.no_grad():
        for data, target in monit.iterate("Test", self.valid_loader):
            data, target = data.to(self.device), target.to(self.device)
            output = self.model(data)

            # Per-sample losses; copy them to the host once and reuse the array
            # for both the per-sample tracking and the running total
            loss = F.nll_loss(output, target, reduction='none').cpu().numpy()
            values = list(loss)
            indexes = [idx + i for i in range(len(values))]
            tracker.add('valid.sample_loss', (indexes, values))
            test_loss += float(np.sum(loss))

            # Predicted class of each sample (kept as a column, hence `keepdim`)
            pred = output.argmax(dim=1, keepdim=True)
            values = list(pred.cpu().numpy())
            tracker.add('valid.sample_pred', (indexes, values))
            correct += pred.eq(target.view_as(pred)).sum().item()

            idx += len(values)

    # Add test loss and accuracy to logger
    tracker.add({'valid.loss': test_loss / len(self.valid_dataset)})
    tracker.add({'valid.accuracy': correct / len(self.valid_dataset)})
def to_daily_packets(df: pd.DataFrame):
    """
    Group per-minute candles into one `(390, 5)` array per date.

    Rows are read in order; a new packet is started every time the value in the
    `Date` column changes (so rows of one date are presumably expected to be
    contiguous - verify against the caller). A row is written to slot `Minute`
    of the current packet; rows whose `Minute` is outside `[0, 390)` are dropped.

    Returns a tuple `(dates, packets)` of NumPy arrays built from the collected
    dates and packets.
    """
    volume = np.array(df['Volume'], dtype=float)
    time = np.array(df["Minute"])

    candles = np.zeros((len(df), 6), dtype=float)
    candles[:, CandleIdx.high] = np.array(df['High'])
    candles[:, CandleIdx.low] = np.array(df['Low'])
    candles[:, CandleIdx.open] = np.array(df['Open'])
    candles[:, CandleIdx.close] = np.array(df['Close'])
    candles[:, CandleIdx.volume] = volume
    candles[:, 5] = time

    # Materialise the date column once. This is positional, like the arrays
    # above, so it stays aligned with `candles` even when `df` does not have a
    # default integer index, and it avoids a `Series` lookup on every row.
    day_of_row = df['Date'].tolist()

    dates = []
    packets = []
    current_day = None
    packet = None

    for i in monit.iterate("To daily packets", len(df)):
        d = day_of_row[i]
        if d != current_day:
            # Close the previous day before starting a new one
            if current_day is not None:
                dates.append(current_day)
                packets.append(packet)
            current_day = d
            packet = np.zeros((390, 5), dtype=float)

        t = time[i]
        if 0 <= t < 390:
            packet[t, :] = candles[i, 0:5]

    # Flush the last day
    if current_day is not None:
        dates.append(current_day)
        packets.append(packet)

    return np.array(dates), np.array(packets)
def sample(self):
    """
    ### Sampling function to generate samples periodically while training

    Greedily extends `self.prompt` by 25 tokens, feeding the model only the last
    token of the prompt together with the memory merged from earlier steps.
    """
    text = self.prompt
    # Pieces to print: the prompt first, then every generated token
    pieces = [(text, Text.subtle)]
    memory = []

    for _ in monit.iterate('Sample', 25):
        # Tokenize the whole prompt but keep only its last token
        tokens = self.text.text_to_i(text).unsqueeze(-1)
        tokens = tokens[-1:].to(self.device)

        logits, new_memory = self.model(tokens, memory)
        # Greedy prediction
        predicted = logits.argmax(dim=-1).squeeze(1)

        piece = self.prompt_separator + self.text.itos[predicted[-1]]
        text += piece
        pieces.append((piece, Text.value))

        memory = self.merge_memory(memory, new_memory)

    # Print the sampled output
    logger.log(pieces)
def _test(self):
    """
    ### Evaluate the encoder on the test set

    For every test batch this computes the loss and the macro-averaged F1 score
    and then saves their means over batches. Note that the value saved under
    `accuracy` is the mean macro F1 score.
    """
    self.encoder.eval()
    with torch.no_grad():
        macro_f1s = []
        test_losses = []
        for input_tensor, target_tensor in monit.iterate("test", self.test_loader):
            encoder_hidden = self.encoder.init_hidden(self.device).double().to(self.device)
            input_tensor = input_tensor.to(self.device).unsqueeze(1)
            target_tensor = target_tensor.to(self.device).double()

            encoder_output, encoder_hidden = self.encoder(input_tensor, encoder_hidden)
            test_loss = self.loss(encoder_output, target_tensor)

            # Predictions are the encoder outputs cast to `int32`
            macro_f1 = f1_score(
                y_true=target_tensor.cpu().detach().numpy().ravel(),
                y_pred=encoder_output.cpu().detach().to(torch.int32).numpy().ravel(),
                average='macro')

            # Store the loss as a host array: `np.mean` below cannot convert
            # tensors that live on a CUDA device
            test_losses.append(test_loss.detach().cpu().numpy())
            macro_f1s.append(macro_f1)

    tracker.save(test_loss=np.mean(test_losses), accuracy=np.mean(macro_f1s))
def sample(self):
    """
    ### Sampling function to generate samples periodically while training

    Starts from the prompt `'It is'` and greedily appends 25 tokens, feeding the
    whole prompt to the model at every step.
    """
    text = 'It is'
    # Pieces to print: the prompt first, then every generated token
    pieces = [(text, Text.subtle)]

    for _ in monit.iterate('Sample', 25):
        # Tokenize the prompt
        tokens = self.dataset.text_to_i(text).unsqueeze(-1).to(self.device)
        # Greedy prediction for every position
        predicted = self.model(tokens).argmax(dim=-1).squeeze()
        # The token predicted after the last position
        token = self.dataset.itos[predicted[-1].item()]

        text += token
        pieces.append((token, Text.value))

    # Print the sampled output
    logger.log(pieces)
def encode(self, data: str, *, is_silent: bool = True):
    """
    Encode `data` by splitting it with `self.word_tokenizer` and concatenating
    the result of `self.bpe.encode` for every word.

    `is_silent` is forwarded to both the word tokenizer and the progress monitor.
    """
    encoded = []
    for word in monit.iterate('Encode words',
                              self.word_tokenizer.tokenize(data, is_silent=is_silent),
                              is_silent=is_silent):
        encoded.extend(self.bpe.encode(word))

    return encoded
def build_index(conf: Configs, n_centeroids: int = 2048, code_size: int = 64, n_probe: int = 8,
                n_train: int = 200_000):
    r"""
    ## Build FAISS index

    [Getting started](https://github.com/facebookresearch/faiss/wiki/Getting-started),
    [faster search](https://github.com/facebookresearch/faiss/wiki/Faster-search),
    and [lower memory footprint](https://github.com/facebookresearch/faiss/wiki/Lower-memory-footprint)
    tutorials on FAISS will help you learn more about FAISS usage.

    The keys are read from the memory mapped `keys.npy` in the lab data path, and
    the index is written to `faiss.index` in the same directory.
    """
    # Dimensions of $f(c_i)$
    d_model = conf.transformer.d_model
    # Shape of the training data held by the training data loader
    data_shape = conf.trainer.data_loader.data.shape
    # Number of contexts; i.e. number of tokens in the training data minus one.
    # $\big(f(c_i), w_i\big)$ for $i \in [2, T]$
    n_keys = data_shape[0] * data_shape[1] - 1

    # Build an index with Voronoi cell based faster search with compression that
    # doesn't store full vectors.
    quantizer = faiss.IndexFlatL2(d_model)
    index = faiss.IndexIVFPQ(quantizer, d_model, n_centeroids, code_size, 8)
    index.nprobe = n_probe

    # Load the memory mapped numpy array of keys
    keys_path = lab.get_data_path() / 'keys.npy'
    keys_store = np.memmap(str(keys_path), dtype=np.float32, mode='r', shape=(n_keys, d_model))

    # Pick a random sample of keys to train the index with
    train_ids = np.random.choice(np.arange(n_keys), size=[min(n_train, n_keys)], replace=False)

    # Train the index to store the keys
    with monit.section('Train index'):
        index.train(keys_store[train_ids])

    # Add keys to the index in batches; $\big(f(c_i), i\big)$
    for start in monit.iterate('Index', range(0, n_keys, 1024)):
        end = min(start + 1024, n_keys)
        # Keys $f(c_i)$ are stored under their own position $i$
        index.add_with_ids(keys_store[start:end], np.arange(start, end))

    # Save the index
    with monit.section('Save'):
        index_path = lab.get_data_path() / 'faiss.index'
        faiss.write_index(index, str(index_path))
def sample(self):
    """
    Greedily sample 25 tokens starting from the prompt `'def train('` and log
    the prompt followed by the generated tokens.
    """
    # Tokens are joined to the prompt with an empty separator
    separator = ''
    text = 'def train('
    pieces = [(text, Text.subtle)]

    for _ in monit.iterate('Sample', 25):
        tokens = self.text.text_to_i(text).unsqueeze(-1).to(self.device)
        # The model may return extra values; only the first one is used
        logits, *_ = self.model(tokens)
        predicted = logits.argmax(dim=-1).squeeze()

        piece = separator + self.text.itos[predicted[-1]]
        text += piece
        pieces.append((piece, Text.value))

    logger.log(pieces)
def interpolate_animate(self, x1: torch.Tensor, x2: torch.Tensor, n_frames: int = 100, t_: int = 100,
                        create_video=True):
    r"""
    #### Interpolate two images $x_0$ and $x'_0$ and make a video

    * `x1` is $x_0$
    * `x2` is $x'_0$
    * `n_frames` is the number of frames for the image
    * `t_` is $t$
    * `create_video` specifies whether to make a video or to show each frame
    """
    # Show original images
    for image, title in ((x1, "x1"), (x2, "x2")):
        self.show_image(image, title)

    # $t$ tensor
    t = torch.full((1,), t_, device=self.device)

    # Add the batch dimension and sample
    # $x_t \sim q(x_t|x_0)$ followed by $x'_t \sim q(x'_t|x'_0)$
    x1t = self.diffusion.q_sample(x1[None, :, :, :], t)
    x2t = self.diffusion.q_sample(x2[None, :, :, :], t)

    frames = []
    # Get frames with different $\lambda$; there are `n_frames + 1` of them so
    # that both $\lambda = 0$ and $\lambda = 1$ are included
    for i in monit.iterate('Interpolate', n_frames + 1, is_children_silent=True):
        # $\lambda$
        lambda_ = i / n_frames
        # $$\bar{x}_t = (1 - \lambda)x_t + \lambda x'_t$$
        mixed = (1 - lambda_) * x1t + lambda_ * x2t
        # $$\bar{x}_0 \sim \textcolor{cyan}{p_\theta}(x_0|\bar{x}_t)$$
        frame = self._sample_x0(mixed, t_)[0]
        frames.append(frame)
        # Show frame
        if not create_video:
            self.show_image(frame, f"{lambda_ :.2f}")

    # Make video
    if create_video:
        self.make_video(frames)
def _test(self):
    """
    Evaluate `self.model` on `self.test_loader` and track the summed NLL loss and
    the number of correct predictions, each divided by the dataset size.
    """
    self.model.eval()
    total_loss = 0
    n_correct = 0

    with torch.no_grad():
        for batch, labels in monit.iterate("Test", self.test_loader):
            batch = batch.to(self.device)
            labels = labels.to(self.device)
            scores = self.model(batch)

            total_loss += F.nll_loss(scores, labels, reduction='sum').item()
            # Predicted class, kept as a column so labels can be viewed to match
            predicted = scores.argmax(dim=1, keepdim=True)
            n_correct += predicted.eq(labels.view_as(predicted)).sum().item()

    # Add test loss and accuracy to logger
    n_samples = len(self.test_loader.dataset)
    tracker.add({'valid.loss': total_loss / n_samples})
    tracker.add({'valid.accuracy': n_correct / n_samples})
def parse(df: pd.DataFrame):
    """
    Add a `Minute` column and rewrite the `Date` column of `df` in place.

    * `Minute` is `hour * 60 + minute - 570`, where the hour is read from
      characters `0:2` and the minute from characters `3:5` of `Time`
      (so `09:30` maps to `0`).
    * `Date` is rebuilt as `year-month-day` from characters `6:10`, `0:2` and
      `3:5` of the original value.

    Returns the same data frame.
    """
    times = df['Time']
    raw_dates = df['Date']

    minutes = np.zeros((len(df)), dtype=int)
    iso_dates = []
    for row in monit.iterate("Calculate time", len(df)):
        clock = times[row]
        minutes[row] = int(clock[0:2]) * 60 + int(clock[3:5])

        raw = raw_dates[row]
        mon, day, year = raw[0:2], raw[3:5], raw[6:10]
        iso_dates.append(f"{year}-{mon}-{day}")

    df['Minute'] = minutes - 570
    df['Date'] = iso_dates

    return df
def extract_tar(tar_file: Path, to_path: Path):
    """
    Extract a ``.tar.gz`` file.

    Directories are skipped, regular files are extracted to ``to_path / name``
    and any other member type is reported with a warning.

    Arguments:
        tar_file (Path): ``.tar.gz`` file
        to_path (Path): location to extract the contents
    """
    with tarfile.open(str(tar_file), 'r:gz') as archive:
        for member in monit.iterate('Extract tar', archive.getmembers()):
            if member.isdir():
                continue

            if member.isfile():
                _extract_tar_file(archive, member, to_path / member.name)
                continue

            logger.log(f'Unknown file type {member.name}', Text.warning)
def _sample_x0(self, xt: torch.Tensor, n_steps: int):
    r"""
    #### Sample an image using $\textcolor{lightgreen}{p_\theta}(x_{t-1}|x_t)$

    * `xt` is $x_t$
    * `n_steps` is $t$
    """
    # Number of samples
    batch_size = xt.shape[0]

    # Walk $t$ down from `n_steps - 1` to $0$
    for step in monit.iterate('Denoise', n_steps):
        t = n_steps - step - 1
        # The same $t$ for every sample in the batch
        t_batch = xt.new_full((batch_size,), t, dtype=torch.long)
        # Sample from $\textcolor{lightgreen}{p_\theta}(x_{t-1}|x_t)$
        xt = self.diffusion.p_sample(xt, t_batch)

    # Return $x_0$
    return xt
def sample(self):
    """
    Greedily sample 25 tokens starting from the prompt `'def train('` and log
    the prompt followed by the generated tokens.

    The model state returned at each step is combined with the previous state by
    `self.state_updater` and passed to the next step. When
    `self.is_token_by_token` is set, the next input is only the token that was
    just generated; otherwise the token is appended to the prompt.
    """
    prompt = 'def train('
    log = [(prompt, Text.subtle)]
    state = None

    for i in monit.iterate('Sample', 25):
        data = self.text.text_to_i(prompt).unsqueeze(-1)
        data = data.to(self.device)
        output, new_state = self.model(data, state)
        # Greedy prediction
        output = output.argmax(dim=-1).squeeze(1)
        token = self.text.tokenizer.itos[output[-1]]

        # Update the prompt exactly once per step; appending before this branch
        # as well would add the token twice when not sampling token by token
        if self.is_token_by_token:
            prompt = token
        else:
            prompt += token

        log.append((token, Text.value))
        state = self.state_updater(state, new_state)

    logger.log(log)
def validate(model, valid_loader, device):
    """
    Evaluate `model` on `valid_loader` and save the mean cross-entropy loss and
    the accuracy (as a percentage) over `valid_loader.dataset`.
    """
    model.eval()
    loss_sum = 0
    n_correct = 0

    with torch.no_grad():
        for batch, labels in monit.iterate("valid", valid_loader):
            batch = batch.to(device)
            labels = labels.to(device)
            scores = model(batch)

            loss_sum += F.cross_entropy(scores, labels, reduction='sum').item()
            # Predicted class, kept as a column so labels can be viewed to match
            predicted = scores.argmax(dim=1, keepdim=True)
            n_correct += predicted.eq(labels.view_as(predicted)).sum().item()

    n_samples = len(valid_loader.dataset)
    valid_loss = loss_sum / n_samples
    valid_accuracy = 100. * n_correct / n_samples

    # **Save stats**
    tracker.save({'loss.valid': valid_loss, 'accuracy.valid': valid_accuracy})
def sample_animation(self, n_frames: int = 1000, create_video: bool = True):
    r"""
    #### Sample an image step-by-step using $\textcolor{cyan}{p_\theta}(x_{t-1}|x_t)$

    We sample an image step-by-step using $\textcolor{cyan}{p_\theta}(x_{t-1}|x_t)$ and at each step
    show the estimate
    $$x_0 \approx \hat{x}_0 = \frac{1}{\sqrt{\bar\alpha_t}}
     \Big( x_t - \sqrt{1 - \bar\alpha_t} \textcolor{cyan}{\epsilon_\theta}(x_t, t) \Big)$$

    * `n_frames` is the number of frames to collect; when it is larger than
      `self.n_steps` a frame is collected at every step
    * `create_video` specifies whether to make a video or to show each frame
    """
    # $x_T \sim p(x_T) = \mathcal{N}(x_T; \mathbf{0}, \mathbf{I})$
    xt = torch.randn([1, self.image_channels, self.image_size, self.image_size], device=self.device)

    # Interval to log $\hat{x}_0$. It is clamped to at least one step because
    # the integer division gives zero when `n_frames > self.n_steps`, which
    # would make the modulo below raise `ZeroDivisionError`.
    interval = max(1, self.n_steps // n_frames)
    # Frames for video
    frames = []
    # Sample $T$ steps
    for t_inv in monit.iterate('Denoise', self.n_steps):
        # $t$
        t_ = self.n_steps - t_inv - 1
        # $t$ in a tensor
        t = xt.new_full((1,), t_, dtype=torch.long)
        # $\textcolor{cyan}{\epsilon_\theta}(x_t, t)$
        eps_theta = self.eps_model(xt, t)
        if t_ % interval == 0:
            # Get $\hat{x}_0$ and add to frames
            x0 = self.p_x0(xt, t, eps_theta)
            frames.append(x0[0])
            if not create_video:
                self.show_image(x0[0], f"{t_}")
        # Sample from $\textcolor{cyan}{p_\theta}(x_{t-1}|x_t)$
        xt = self.p_sample(xt, t, eps_theta)

    # Make video
    if create_video:
        self.make_video(frames)
def iterate(self):
    r"""
    ### Iteratively update $\textcolor{lightgreen}{\sigma^t(I)(a)}$

    This updates the strategies for $T$ iterations.
    """
    # Loop for `epochs` times
    for epoch in monit.iterate('Train', self.epochs):
        # Walk the tree from a fresh history once for each player
        for player in range(self.n_players):
            history = self.create_new_history()
            self.walk_tree(history, cast(Player, player), 1, 1)

        # Track data for analytics
        tracker.add_global_step()
        self.tracker(self.info_sets)
        tracker.save()

        # Save checkpoints every $1,000$ iterations
        is_checkpoint = (epoch + 1) % 1_000 == 0
        if is_checkpoint:
            experiment.save_checkpoint()
def run(self):
    """
    Run the training loop.

    On every iteration this first greedily samples 25 tokens from the prompt
    `'def train('` and logs them, then runs the trainer and the validator in
    the `train` and `valid` tracker namespaces.
    """
    # Tokens are joined to the prompt with an empty separator
    separator = ''

    for _ in self.training_loop:
        # Sample
        text = 'def train('
        pieces = [(text, Text.subtle)]
        for _step in monit.iterate('Sample', 25):
            tokens = self.text.text_to_i(text).unsqueeze(-1).to(self.device)
            # The model may return extra values; only the first one is used
            logits, *_rest = self.model(tokens)
            predicted = logits.argmax(dim=-1).squeeze()

            piece = separator + self.text.itos[predicted[-1]]
            text += piece
            pieces.append((piece, Text.value))

        logger.log(pieces)

        # Train
        with Mode(is_train=True,
                  is_log_parameters=self.is_log_parameters,
                  is_log_activations=self.is_log_activations):
            with tracker.namespace('train'):
                self.trainer()

        # Validate
        with tracker.namespace('valid'):
            self.validator()
def sample(self):
    """
    Sampling function to generate samples periodically while training

    Greedily extends `self.prompt` by 25 tokens, joining each token with
    `self.prompt_separator`, and logs the result.
    """
    text = self.prompt
    pieces = [(text, Text.subtle)]

    # Sample 25 tokens
    for _ in monit.iterate('Sample', 25):
        # Tokenize the prompt
        tokens = self.text.text_to_i(text).unsqueeze(-1).to(self.device)
        # Greedy prediction for every position
        predicted = self.model(tokens).argmax(dim=-1).squeeze()

        # Separator plus the token predicted after the last position
        piece = self.prompt_separator + self.text.itos[predicted[-1]]
        text += piece
        pieces.append((piece, Text.value))

    logger.log(pieces)
def train(self):
    """
    ### Train

    Runs one pass over `self.data_loader`, taking one optimization step on the
    diffusion loss per batch.
    """
    for batch in monit.iterate('Train', self.data_loader):
        # Increment global step
        tracker.add_global_step()

        # Move data to device
        batch = batch.to(self.device)

        # Reset gradients, then compute the loss and its gradients
        self.optimizer.zero_grad()
        batch_loss = self.diffusion.loss(batch)
        batch_loss.backward()

        # Take an optimization step
        self.optimizer.step()

        # Track the loss
        tracker.save('loss', batch_loss)
def sample(self):
    r"""
    ### Sample images

    Starts from Gaussian noise and applies `self.diffusion.p_sample` for
    `self.n_steps` steps, then logs the resulting batch.
    """
    with torch.no_grad():
        # $x_T \sim p(x_T) = \mathcal{N}(x_T; \mathbf{0}, \mathbf{I})$
        shape = [self.n_samples, self.image_channels, self.image_size, self.image_size]
        images = torch.randn(shape, device=self.device)

        # Remove noise for $T$ steps
        for step in monit.iterate('Sample', self.n_steps):
            # $t$ counts down from `n_steps - 1` to $0$
            t = self.n_steps - step - 1
            # The same $t$ for every sample in the batch
            t_batch = images.new_full((self.n_samples,), t, dtype=torch.long)
            # Sample from $\color{cyan}{p_\theta}(x_{t-1}|x_t)$
            images = self.diffusion.p_sample(images, t_batch)

        # Log samples
        tracker.save('sample', images)
def __init__(self, path: PurePath, tokenizer: Callable, train: str, valid: str, test: str, *,
             n_tokens: Optional[int] = None,
             stoi: Optional[Dict[str, int]] = None,
             itos: Optional[List[str]] = None):
    """
    Store the data splits and set up the vocabulary.

    If any of `n_tokens`, `stoi` or `itos` is given, all three must be given and
    are used as they are. Otherwise the vocabulary starts with
    `self.standard_tokens` and is extended with the sorted unique tokens that
    `tokenizer` produces for the `train` and `valid` text.
    """
    self.path = path
    self.tokenizer = tokenizer
    self.train = train
    self.valid = valid
    self.test = test

    # A vocabulary was supplied by the caller
    if n_tokens or stoi or itos:
        assert stoi and itos and n_tokens
        self.n_tokens = n_tokens
        self.stoi = stoi
        self.itos = itos
        return

    # Start from the standard tokens
    self.n_tokens = len(self.standard_tokens)
    self.stoi = {token: idx for idx, token in enumerate(self.standard_tokens)}

    with monit.section("Tokenize"):
        vocabulary = sorted(set(self.tokenizer(self.train) + self.tokenizer(self.valid)))

    # Give every token the next free id
    for token in monit.iterate("Build vocabulary", vocabulary):
        self.stoi[token] = self.n_tokens
        self.n_tokens += 1

    # Invert the mapping
    self.itos = [''] * self.n_tokens
    for token, idx in self.stoi.items():
        self.itos[idx] = token
def learn(self, merges: int):
    """
    Run `merges` rounds; in each round `self.merge_pair()` is called repeatedly
    until it returns something other than `None`.
    """
    for _ in monit.iterate('BPE', merges):
        # Retry until a call reports a result
        while self.merge_pair() is None:
            pass
def sample(self):
    """
    ### Evaluation

    We use the sampling function to evaluate the model on a set of problems.
    The tracked `score` is the fraction of problems whose sampled answer equals
    the expected answer.
    """
    # Skip in the first epoch
    if self.training_loop.idx < 1:
        return

    # Create a dataset to generate problems
    dataset = ArithmeticDataset(self.seq_len, self.max_digits, 1)
    # Get a set of problems and answers, and collect the problems only
    qa = [dataset.get_qa() for _ in range(self.n_tests)]
    questions = [pair[0] for pair in qa]

    # Create a tensor with only the initial token of each problem, on the device
    data = torch.tensor([[dataset.stoi[question[0]] for question in questions]]).to(self.device)
    # Flags for the sequences that have completed
    finished = torch.zeros((len(questions),)).bool().to(self.device)
    # Token id of the new line character - this marks end of the answer
    new_line = dataset.stoi['\n']

    # Sampled results, starting with the first character of each problem
    results = [question[0] for question in questions]

    # Sample up to sequence length
    for step in monit.iterate('Sample', self.seq_len - 1):
        # If all the sequences have completed we skip this
        if finished.sum() == len(finished):
            continue

        # Greedy prediction for the last position
        logits, *_ = self.model(data)
        output = logits[-1].argmax(dim=-1)

        # Find which sequences have finished
        finished = finished | (output == new_line)
        # Skip if all have finished
        if finished.sum() == len(finished):
            continue

        # While still inside a question, override the prediction with it
        position = step + 1
        for j, question in enumerate(questions):
            if position < len(question):
                output[j] = dataset.stoi[question[position]]

        # Add the next token to the input
        data = torch.cat([data, output[None, :]], dim=0)

        # Append the chosen token to each result
        for j, token in enumerate(output):
            results[j] += dataset.itos[token]

    # Discard everything after the answer in the results
    results = [result.split('\n')[0] for result in results]

    # Log a sample
    first_problem, *rest = results[0].split(';')
    logger.log([(first_problem, Text.key), (';', Text.subtle), (';'.join(rest), Text.none)])

    # Get the answers
    results = [result.split('x==')[-1] for result in results]

    # Count the number of correct answers
    correct = 0
    for answer, (_question, expected, *_others) in zip(results, qa):
        if answer == expected:
            correct += 1

    # Log the score
    tracker.save('score', correct / len(results))
def build_dataset(chunk_len: int = 16, chunks_per_sample: int = 32, skip_range: int = 8):
    """
    ## Build the dataset

    * `chunk_len` is the chunk length
    * `chunks_per_sample` is the number of chunks per training sample
    * `skip_range` is the maximum number of characters to skip between two samples.
        We skip a few characters between samples to make sure the samples
        aren't aligned perfectly with the chunks in the [database](database.html)

    The samples are written to `retro_train_dataset.json` in the lab data path
    as `(input, target, neighbors)` entries.
    """
    # Load the text file
    dataset = TextFileDataset(
        lab.get_data_path() / 'tiny_shakespeare.txt',
        list,
        url='https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt')

    # Training portion of it
    text = dataset.train

    # Load the index for retrieving neighbors
    index = RetroIndex()

    # Length of the input of one sample
    sample_len = chunks_per_sample * chunk_len

    # Collect the input sample offsets
    sample_offsets = []
    cursor = 0
    while cursor < len(text):
        # Skip a few characters to make sure it's not aligned with the neighbors
        cursor += np.random.randint(skip_range)
        # Stop if we've reached the end of the text
        if cursor + sample_len > len(text):
            break
        sample_offsets.append(cursor)
        # Move past this sample
        cursor += sample_len

    samples = []
    # Iterate through sample offsets
    for offset in monit.iterate('Gather Neighbors', sample_offsets):
        # Get the sample including an extra character (for prediction)
        sample = text[offset: offset + sample_len + 1]
        # The input
        src = sample[:-1]
        # Start of each chunk, relative to the sample
        starts = range(0, len(src), chunk_len)
        # Break the input into chunks, with their offsets in the whole text
        chunks = [src[j:j + chunk_len] for j in starts]
        chunk_offsets = [j + offset for j in starts]

        # Retrieve nearest neighbors
        neighbor_offsets = index(chunks, chunk_offsets)

        # Get neighbor texts. The neighbor length is twice the `chunk_len`
        neighbors = [[text[j: j + chunk_len * 2] for j in n_off] for n_off in neighbor_offsets]

        # Input, target (shifted by one) and neighbors
        samples.append((src, sample[1:], neighbors))

    # Save the samples in JSON.
    # We don't need to use complex dataset storage mechanisms or pre-tokenize
    # since our dataset is small.
    with open(str(lab.get_data_path() / 'retro_train_dataset.json'), 'w') as f:
        f.write(json.dumps(samples))
def build_database(chunk_len: int = 16, batch_size: int = 64, d_emb: int = 768, n_centeroids: int = 256,
                   code_size: int = 64, n_probe: int = 8, n_train: int = 50_000):
    r"""
    ## Build Database

    * `chunk_len` is the length of a chunk (number of characters)
    * `batch_size` is the batch size to use when calculating $\text{B\small{ERT}}(N)$
    * `d_emb` is the number of features in $\text{B\small{ERT}}(N)$ embeddings
        [lists to select in FAISS index](https://faiss.ai/cpp_api/struct/structfaiss_1_1IndexIVFPQ.html)
    * `n_centeroids` is the number of lists in the index
    * `code_size` encoded vector size in the index
    * `n_probe` is the number of lists to probe
    * `n_train` is the number of keys to train the index on

    The index is written to `retro.index` in the lab data path.
    """
    # Load the dataset text file
    dataset = TextFileDataset(
        lab.get_data_path() / 'tiny_shakespeare.txt',
        list,
        url='https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt')

    # Get training data (a string)
    text = dataset.train

    # Offsets of the chunks; a chunk is only kept when there are more than
    # `chunk_len * 2` characters from its start to the end of the text
    offsets = [i for i in range(0, len(text), chunk_len) if i + chunk_len * 2 < len(text)]
    # Split the text into chunks of `chunk_len`
    chunks = [text[i:i + chunk_len] for i in offsets]
    chunk_offsets = np.array(offsets)
    # Number of chunks
    n_chunks = len(chunks)

    # Initialize BERT to get $\text{B\small{ERT}}(N)$
    bert = BERTChunkEmbeddings(torch.device('cuda:0'))

    # Get chunk embeddings by processing `batch_size` number of chunks on each iteration
    batches = []
    for start in monit.iterate('Get embeddings', range(0, n_chunks, batch_size)):
        batches.append(bert(chunks[start: start + batch_size]).cpu())
    # Merge them into a single array
    chunk_emb = torch.cat(batches, dim=0).numpy()

    # Create the [FAISS index](https://faiss.ai/cpp_api/struct/structfaiss_1_1IndexIVFPQ.html)
    quantizer = faiss.IndexFlatL2(d_emb)
    index = faiss.IndexIVFPQ(quantizer, d_emb, n_centeroids, code_size, 8)
    index.nprobe = n_probe

    # Get a random sample of the chunk indexes
    train_ids = np.random.choice(np.arange(n_chunks), size=[min(n_train, n_chunks)], replace=False)

    # Train the index to store the keys
    with monit.section('Train index'):
        index.train(chunk_emb[train_ids])

    # Add the chunks to the index in batches of size `1024`,
    # using the chunk offsets as their ids
    for start in monit.iterate('Index', range(0, n_chunks, 1024)):
        end = min(start + 1024, n_chunks)
        index.add_with_ids(chunk_emb[start:end], chunk_offsets[start:end])

    # Save the index
    with monit.section('Save'):
        faiss.write_index(index, str(lab.get_data_path() / 'retro.index'))
# One-off script: reset the `sessions` attribute of every stored computer from
# a list to an empty set and save the changed records.
from labml import monit
from labml_app.db import computer, init_db

# NOTE(review): `res` is never filled in, so the final `print` always shows an
# empty dict - confirm whether it was meant to collect the migrated computers.
res = {}

init_db()

computer_keys = computer.Computer.get_all()
for computer_key in monit.iterate('computers', computer_keys):
    c: computer.Computer = computer_key.load()

    # Only records that still hold a list are touched and saved
    if isinstance(c.sessions, list):
        c.sessions = set()
        c.save()

print(res)
def fill_empty_minutes_in_packets(packets: np.ndarray):
    """
    Call `fill_empty_minutes_in_packet` on every entry along the first axis of
    `packets`.
    """
    n_packets = packets.shape[0]
    for day in monit.iterate("Fill empty minutes", n_packets):
        fill_empty_minutes_in_packet(packets[day])