def __call__(self):
    """
    ### Train the model for an epoch
    """
    # Iterate through training data
    for i, (src, tgt, neighbors) in monit.enum('Train', self.dataloader):
        # Move data to the device
        src, tgt, neighbors = src.to(self.device), tgt.to(self.device), neighbors.to(self.device)

        # Forward pass
        res = self.model(src, neighbors)
        # Calculate loss
        loss = self.loss_func(res.view(-1, res.shape[-1]), tgt.view(-1))

        # Clear the gradients
        self.optimizer.zero_grad()
        # Backward pass
        loss.backward()
        # Optimize the model
        self.optimizer.step()

        # Save training statistics and increment the global step counter
        tracker.save({'loss.train': loss})
        tracker.add_global_step(len(src))
def iterate(self):
    device = get_device(self.model)
    correct_sum = 0
    total_samples = 0

    for i, (data, target) in monit.enum(self.name, self.data_loader):
        data, target = data.to(device), target.to(device)

        if self.optimizer is not None:
            self.optimizer.zero_grad()

        output = self.model(data)
        loss = self.loss_func(output, target)
        correct_sum += self.accuracy_func(output, target)
        total_samples += len(target)

        tracker.add(".loss", loss)

        if self.optimizer is not None:
            loss.backward()
            self.optimizer.step()

        if self.is_increment_global_step:
            tracker.add_global_step(len(target))

        if self.log_interval is not None and (i + 1) % self.log_interval == 0:
            tracker.save()

    tracker.add(".accuracy", correct_sum / total_samples)
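# `accuracy_func` is not defined in this snippet. Since its results are summed
# and later divided by the total sample count, a minimal sketch would return
# the number of correct predictions in a batch (an assumption):
import torch

def accuracy_func(output: torch.Tensor, target: torch.Tensor) -> int:
    # Count predictions that match the target labels
    return (output.argmax(dim=-1) == target).sum().item()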
def _train(self):
    for i, (images, _) in monit.enum("train", self.train_loader):
        # Smoothed labels: real targets near 1, fake targets near 0
        targets_real = torch.empty(images.size(0), 1, device=self.device).uniform_(0.8, 1.0)
        targets_fake = torch.empty(images.size(0), 1, device=self.device).uniform_(0.0, 0.2)

        images = images.to(self.device)

        # Train the discriminator
        self.optimizer_D.zero_grad()
        logits_real = self.discriminator(images)
        fake_images = self.generator(noise(self.device, self.batch_size, self.noise_dim)).detach()
        logits_fake = self.discriminator(fake_images)
        discriminator_loss = DLoss(logits_real, logits_fake, targets_real, targets_fake)
        discriminator_loss.backward()
        self.optimizer_D.step()

        # Train the generator
        self.optimizer_G.zero_grad()
        fake_images = self.generator(noise(self.device, self.batch_size, self.noise_dim))
        logits_fake = self.discriminator(fake_images)
        generator_loss = GLoss(logits_fake, targets_real)
        generator_loss.backward()
        self.optimizer_G.step()

        tracker.add(G_Loss=generator_loss.item())
        tracker.add(D_Loss=discriminator_loss.item())
        tracker.add_global_step()

        # Log a few generated samples
        for j in range(1, 10):
            img = fake_images[j].squeeze()
            tracker.add('generated', img)
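# `noise`, `DLoss` and `GLoss` are not defined in this snippet. A minimal
# sketch of what they might look like, assuming a standard-normal latent and
# binary cross-entropy on logits (the smoothed targets above suggest
# BCE-style losses; these definitions are assumptions, not the originals):
import torch
import torch.nn.functional as F

def noise(device: torch.device, batch_size: int, noise_dim: int) -> torch.Tensor:
    # Sample latent vectors from a standard normal distribution
    return torch.randn(batch_size, noise_dim, device=device)

def DLoss(logits_real, logits_fake, targets_real, targets_fake):
    # Push real logits toward the (smoothed) real targets
    # and fake logits toward the fake targets
    return (F.binary_cross_entropy_with_logits(logits_real, targets_real) +
            F.binary_cross_entropy_with_logits(logits_fake, targets_fake))

def GLoss(logits_fake, targets_real):
    # The generator tries to get fakes labelled as real
    return F.binary_cross_entropy_with_logits(logits_fake, targets_real)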
def concat_and_save(path: PurePath, source_files: List[PythonFile]):
    with open(str(path), 'w') as f:
        for i, source in monit.enum(f"Write {path.name}", source_files):
            f.write(f"# PROJECT: {source.project} FILE: {str(source.relative_path)}\n")
            f.write(read_file(source.path) + "\n")
def collect_pairs(self):
    for w, v in monit.enum('Collect pairs', self.word_codes):
        f = self.word_freq[w]
        for i in range(len(v) - 1):
            self.add_pair(w, i, i + 1)

    self.heap_add_all()
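# `add_pair` and `heap_add_all` are not shown in this snippet. A plausible
# sketch for a BPE-style merge loop, assuming a `pair_counts` dict keyed by
# adjacent symbol pairs and a max-heap built by negating counts (both the
# attribute name and the weighting are assumptions):
import heapq

def add_pair(self, w: int, i: int, j: int):
    # Count the adjacent symbol pair, weighted by how often the word occurs
    pair = (self.word_codes[w][i], self.word_codes[w][j])
    self.pair_counts[pair] = self.pair_counts.get(pair, 0) + self.word_freq[w]

def heap_add_all(self):
    # Build a max-heap of pairs by count (Python's heapq is a min-heap)
    self.heap = [(-count, pair) for pair, count in self.pair_counts.items()]
    heapq.heapify(self.heap)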
def download():
    path = Path(lab.get_data_path() / 'download')
    if not path.exists():
        path.mkdir(parents=True)

    get_awesome_pytorch()
    repos = get_repos('pytorch_awesome.md')

    for i, r in monit.enum("Download", repos):
        download_repo(r[0], r[1], i)
def progressive(overwrite: bool = False):
    # Get repos
    get_awesome_pytorch_readme()
    repos = get_repos_from_readme('pytorch_awesome.md')

    # Download zips
    for i, r in monit.enum(f"Download {len(repos)} repos", repos):
        zip_file = download_repo(r[0], r[1], i)
        extracted = extract_zip(zip_file, overwrite)
        remove_files(extracted, {'.py'})
def main():
    source_files = _GetPythonFiles().files
    logger.inspect(source_files)

    with open(str(Path(os.getcwd()) / 'data' / 'all.py'), 'w') as f:
        for i, source in monit.enum("Parse", source_files):
            serialized = _read_file(source.path)
            serialized = [str(t) for t in serialized]
            f.write(f"{str(source.path)}\n")
            f.write(" ".join(serialized) + "\n")
def train(self):
    """
    ### Train the model
    """
    # Loop for the given number of epochs
    for _ in monit.loop(self.epochs):
        # Iterate over the minibatches
        for i, batch in monit.enum('Train', self.dataloader):
            # Move data to the device
            data, target = batch[0].to(self.device), batch[1].to(self.device)

            # Increment the global step by the number of characters trained on
            tracker.add_global_step(data.shape[0] * data.shape[1])

            # Set model state to training
            self.model.train()
            # Run the model
            output = self.model(data)

            # Calculate loss
            loss = self.loss_func(output.view(-1, output.shape[-1]), target.view(-1))
            # Log the loss
            tracker.add("loss.train", loss)

            # Calculate gradients
            loss.backward()
            # Clip gradients
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=self.grad_norm_clip)
            # Take optimizer step
            self.optimizer.step()
            # Log the model parameters and gradients
            if (i + 1) % 100 == 0:
                tracker.add('model', self.model)
            # Clear the gradients
            self.optimizer.zero_grad()

            # Generate a sample
            if (i + 1) % 100 == 0:
                self.model.eval()
                with torch.no_grad():
                    self.sample()

            # Save the tracked metrics
            if (i + 1) % 10 == 0:
                tracker.save()

        # Save the model
        experiment.save_checkpoint()
def iterate(self):
    stats = self.batch_step.init_stats()

    for i, batch in monit.enum(self.name, self.data_loader):
        update = self.batch_step.process(batch)
        self.batch_step.update_stats(stats, update)

        if self.is_increment_global_step:
            tracker.add_global_step(update['samples'])

        if self.log_interval is not None and (i + 1) % self.log_interval == 0:
            tracker.save()

    self.batch_step.log_stats(stats)
def validation_loss(knn_weights: List[float], last_n: Optional[int], conf: Configs,
                    index: faiss.IndexFlatL2, keys_store: np.ndarray, vals_store: np.ndarray):
    """
    ## Calculate validation loss

    We calculate the validation loss of the combined $k$-NN and transformer
    predictions. The weight given to the $k$-NN model is set by `knn_weights`.
    It's a list of weights, and we calculate the validation loss for each.
    """
    # List of losses for each of `knn_weights`
    losses = [[] for _ in knn_weights]
    # Number of samples in each batch
    n_samples = []
    with torch.no_grad():
        # Iterate through validation data
        for i, batch in monit.enum("Validation", conf.validator.data_loader, is_children_silent=True):
            # Get data and target labels
            data, target = batch[0].to(conf.device), batch[1].to(conf.device)
            # Run the model and get predictions $p(w_t | c_t)$
            res = conf.model(data)
            # Get $k$-NN predictions
            res_knn = knn(conf.model.ff_input.cpu(), index, keys_store, vals_store, conf.n_tokens)
            res_knn = res_knn.to(conf.device)

            # This is to calculate the loss for the `last_n` tokens only.
            # This is important because the first predictions (along the sequence)
            # of the transformer model have very few past tokens to look at.
            if last_n:
                res = res[-last_n:]
                res_knn = res_knn[-last_n:]
                target = target[-last_n:]

            # Number of samples
            n_s = res.shape[0] * data.shape[1]
            n_samples.append(n_s)

            # Calculate the loss for each of `knn_weights`.
            for j, c in enumerate(knn_weights):
                # Calculate the loss
                loss = conf.loss_func(res_knn * c + (1 - c) * res, target)
                losses[j].append(loss * n_s)

    return losses, n_samples
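# `knn` is not defined in this snippet. A minimal sketch of a kNN-LM style
# lookup, assuming the retrieved neighbors are turned into a distribution
# over the vocabulary via a softmax over negative distances (`n_neighbors`
# and the weighting scheme are assumptions):
import faiss
import numpy as np
import torch

def knn(queries: torch.Tensor, index: faiss.IndexFlatL2,
        keys_store: np.ndarray, vals_store: np.ndarray, n_tokens: int,
        n_neighbors: int = 10) -> torch.Tensor:
    queries_shape = queries.shape
    # Flatten to [n_queries, d_model] for the index search
    queries = queries.reshape(-1, queries_shape[-1])
    # Distances and indices of the nearest keys
    distance, idx = index.search(np.ascontiguousarray(queries.numpy()), n_neighbors)
    # Convert distances to neighbor weights
    weights = torch.softmax(torch.from_numpy(-distance), dim=-1)
    # Target tokens of the retrieved neighbors
    tokens = torch.from_numpy(vals_store[idx]).squeeze(-1).long()
    # Accumulate neighbor weights onto their target tokens
    probs = torch.zeros(queries.shape[0], n_tokens)
    probs.scatter_add_(-1, tokens, weights)
    # Restore the sequence/batch dimensions
    return probs.reshape(*queries_shape[:-1], n_tokens)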
def train(self):
    self.model.train()
    for i, (data, target) in monit.enum("Train", self.train_loader):
        data, target = data.to(self.device), target.to(self.device)

        self.optimizer.zero_grad()
        output = self.model(data)
        loss = F.cross_entropy(output, target)
        loss.backward()
        self.optimizer.step()

        tracker.add({'train.loss': loss})
        tracker.add_global_step()

        if i % self.train_log_interval == 0:
            tracker.save()
def gather_keys(conf: Configs):
    """
    ## Gather $\big(f(c_i), w_i\big)$ and save them in numpy arrays

    *Note that these numpy arrays will take up a lot of space
    (even a few hundred gigabytes) depending on the size of your dataset*.
    """
    # Dimensionality of $f(c_i)$
    d_model = conf.transformer.d_model
    # Training data loader
    data_loader = conf.trainer.data_loader
    # Number of contexts; i.e. number of tokens in the training data minus one.
    # $\big(f(c_i), w_i\big)$ for $i \in [2, T]$
    n_keys = data_loader.data.shape[0] * data_loader.data.shape[1] - 1

    # Numpy array for $f(c_i)$
    keys_store = np.memmap(str(lab.get_data_path() / 'keys.npy'), dtype=np.float32,
                           mode='w+', shape=(n_keys, d_model))
    # Numpy array for $w_i$
    vals_store = np.memmap(str(lab.get_data_path() / 'vals.npy'), dtype=np.int64,
                           mode='w+', shape=(n_keys, 1))

    # Number of keys $f(c_i)$ collected
    added = 0
    with torch.no_grad():
        # Loop through the data
        for i, batch in monit.enum("Collect data", data_loader, is_children_silent=True):
            # $w_i$, the target labels
            vals = batch[1].view(-1, 1)
            # Input data moved to the device of the model
            data = batch[0].to(conf.device)
            # Run the model
            _ = conf.model(data)
            # Get $f(c_i)$
            keys = conf.model.ff_input.view(-1, d_model)
            # Save keys, $f(c_i)$, in the memory-mapped numpy array
            keys_store[added:added + keys.shape[0]] = keys.cpu()
            # Save values, $w_i$, in the memory-mapped numpy array
            vals_store[added:added + keys.shape[0]] = vals
            # Increment the number of collected keys
            added += keys.shape[0]
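# The `faiss.IndexFlatL2` consumed by `validation_loss` above can be built
# from the key store written here. A minimal sketch, assuming the keys are
# added in chunks to keep memory bounded (`build_index` and `chunk` are
# illustrative names, not part of the original code):
import faiss
import numpy as np
from labml import lab

def build_index(n_keys: int, d_model: int, chunk: int = 1024 * 1024) -> faiss.IndexFlatL2:
    # Re-open the memory-mapped keys written by `gather_keys`
    keys_store = np.memmap(str(lab.get_data_path() / 'keys.npy'), dtype=np.float32,
                           mode='r', shape=(n_keys, d_model))
    # Exact L2 index; an approximate index (e.g. IVF) would be the
    # practical choice for very large key stores
    index = faiss.IndexFlatL2(d_model)
    for s in range(0, n_keys, chunk):
        index.add(np.ascontiguousarray(keys_store[s:s + chunk]))
    return index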
def train_epoch(self, model: nn.Module, data_loader: DataLoader, name: str):
    """
    Train/validate for an epoch
    """
    model.train(name == 'train')
    correct_predictions = 0
    total = 0
    total_loss = 0

    with torch.set_grad_enabled(name == 'train'):
        for i, data in monit.enum(name, data_loader):
            input_ids = data["input_ids"].to(self.device)
            attention_mask = data["attention_mask"].to(self.device)
            targets = data["targets"].to(self.device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
            loss = self.loss_fn(outputs, targets)

            total_loss += loss.item() * len(preds)
            correct_predictions += torch.sum(preds == targets).item()
            total += len(preds)
            tracker.add('loss.', loss)

            if name == 'train':
                tracker.add_global_step(len(preds))
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                self.optimizer.step()
                self.optimizer.zero_grad()

            if (i + 1) % 10 == 0:
                tracker.save()

    tracker.save('accuracy.', correct_predictions / total)
    mlflow.log_metric(f"{name}_acc", float(correct_predictions / total), step=tracker.get_global_step())
    mlflow.log_metric(f"{name}_loss", float(total_loss / total), step=tracker.get_global_step())

    return correct_predictions / total, total_loss / total
def collect_words(self, data: str):
    last_idx = 0
    is_id = False

    for i, c in monit.enum('Collect words', data):
        if c in ID_CHARS:
            if not is_id:
                self.add_word(data[last_idx:i])
                last_idx = i
                is_id = True
        else:
            if is_id:
                self.add_word(data[last_idx:i])
                last_idx = i
                is_id = False

    self.add_word(data[last_idx:])
def batch(overwrite: bool = False):
    with monit.section('Get pytorch_awesome'):
        get_awesome_pytorch_readme()
        repos = get_repos_from_readme('pytorch_awesome.md')

    # Download zips
    for i, r in monit.enum(f"Download {len(repos)} repos", repos):
        download_repo(r[0], r[1], i)

    # Extract downloads
    with monit.section('Extract zips'):
        download = Path(lab.get_data_path() / 'download')
        for repo in download.iterdir():
            extract_zip(repo, overwrite)

    with monit.section('Remove non python files'):
        remove_files(lab.get_data_path() / 'source', {'.py'})
def _train(self):
    self.model.train()
    for i, (data, target) in monit.enum("Train", self.train_loader):
        data, target = data.to(self.device), target.to(self.device)

        self.optimizer.zero_grad()
        output = self.model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        self.optimizer.step()

        # Add training loss to the logger.
        # The logger will queue the values and output the mean.
        tracker.add({'train.loss': loss})
        tracker.add_global_step()

        # Print output to the console
        if i % self.log_interval == 0:
            # Output the indicators
            tracker.save()
def _train(self):
    for i, (input_tensor, target_tensor) in monit.enum("train", self.train_loader):
        # Initialize the encoder hidden state on the device
        encoder_hidden = self.encoder.init_hidden(self.device).double().to(self.device)
        input_tensor = input_tensor.to(self.device).unsqueeze(1)
        target_tensor = target_tensor.to(self.device).double()

        self.optimizer.zero_grad()
        # Run the encoder and calculate the loss
        encoder_output, encoder_hidden = self.encoder(input_tensor, encoder_hidden)
        train_loss = self.loss(encoder_output, target_tensor)
        train_loss.backward()
        self.optimizer.step()

        tracker.add(loss=train_loss.item())
        tracker.add_global_step()
        tracker.save()
def train(self):
    for _ in monit.loop(self.epochs):
        for i, batch in monit.enum('Train', self.dataloader):
            # Move data to the device
            data, target = batch[0].to(self.device), batch[1].to(self.device)
            tracker.add_global_step(data.shape[0] * data.shape[1])

            self.model.train()
            output = self.model(data)

            # Calculate and log loss
            loss = self.loss_func(output.view(-1, output.shape[-1]), target.view(-1))
            tracker.add("loss.train", loss)

            # Calculate gradients
            loss.backward()
            # Clip gradients
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=self.grad_norm_clip)
            # Take optimizer step
            self.optimizer.step()
            # Log the model parameters and gradients every 100 batches
            if (i + 1) % 100 == 0:
                tracker.add('model', self.model)
            # Clear the gradients
            self.optimizer.zero_grad()

            # Generate a sample every 100 batches
            if (i + 1) % 100 == 0:
                self.model.eval()
                with torch.no_grad():
                    self.sample()

            # Save the tracked metrics
            if (i + 1) % 10 == 0:
                tracker.save()

        experiment.save_checkpoint()
def train(model, optimizer, train_loader, device, train_log_interval):
    """This is the training code"""
    model.train()
    for batch_idx, (data, target) in monit.enum("Train", train_loader):
        data, target = data.to(device), target.to(device)

        optimizer.zero_grad()
        output = model(data)
        loss = F.cross_entropy(output, target)
        loss.backward()
        optimizer.step()

        # **✨ Increment the global step**
        tracker.add_global_step()
        # **✨ Store stats in the tracker**
        tracker.add({'loss.train': loss})

        if batch_idx % train_log_interval == 0:
            # **✨ Save added stats**
            tracker.save()
def tokenize(self, data: str, *, is_silent: bool = False) -> List[str]:
    last_idx = 0
    is_id = False
    res = []

    for i, c in monit.enum('Collect words', data, is_silent=is_silent):
        if c in ID_CHARS:
            if not is_id:
                if last_idx < i:
                    res.append(data[last_idx:i])
                last_idx = i
                is_id = True
        else:
            if is_id:
                if last_idx < i:
                    res.append(data[last_idx:i])
                last_idx = i
                is_id = False

    if last_idx < len(data):
        res.append(data[last_idx:])

    return res
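# A hypothetical usage of `tokenize`, assuming `ID_CHARS` contains letters,
# digits and underscores; identifier runs and non-identifier runs come out
# as separate tokens:
#
#     tokenizer.tokenize("x = foo(1)")
#     # -> ['x', ' = ', 'foo', '(', '1', ')']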
def run(self):
    """
    ## Training

    We aim to solve:
    $$G^{*}, F^{*} = \arg \min_{G,F} \max_{D_X, D_Y} \mathcal{L}(G, F, D_X, D_Y)$$

    where,
    $G$ translates images from $X \rightarrow Y$,
    $F$ translates images from $Y \rightarrow X$,
    $D_X$ tests if images are from $X$ space,
    $D_Y$ tests if images are from $Y$ space, and

    \begin{align}
    \mathcal{L}(G, F, D_X, D_Y)
        &= \mathcal{L}_{GAN}(G, D_Y, X, Y) \\
        &+ \mathcal{L}_{GAN}(F, D_X, Y, X) \\
        &+ \lambda_1 \mathcal{L}_{cyc}(G, F) \\
        &+ \lambda_2 \mathcal{L}_{identity}(G, F) \\
    \\
    \mathcal{L}_{GAN}(G, D_Y, X, Y) + \mathcal{L}_{GAN}(F, D_X, Y, X)
        &= \mathbb{E}_{y \sim p_{data}(y)} \Big[\log D_Y(y)\Big] \\
        &+ \mathbb{E}_{x \sim p_{data}(x)} \bigg[\log\Big(1 - D_Y(G(x))\Big)\bigg] \\
        &+ \mathbb{E}_{x \sim p_{data}(x)} \Big[\log D_X(x)\Big] \\
        &+ \mathbb{E}_{y \sim p_{data}(y)} \bigg[\log\Big(1 - D_X(F(y))\Big)\bigg] \\
    \\
    \mathcal{L}_{cyc}(G, F)
        &= \mathbb{E}_{x \sim p_{data}(x)} \Big[\lVert F(G(x)) - x \rVert_1\Big] \\
        &+ \mathbb{E}_{y \sim p_{data}(y)} \Big[\lVert G(F(y)) - y \rVert_1\Big] \\
    \\
    \mathcal{L}_{identity}(G, F)
        &= \mathbb{E}_{x \sim p_{data}(x)} \Big[\lVert F(x) - x \rVert_1\Big] \\
        &+ \mathbb{E}_{y \sim p_{data}(y)} \Big[\lVert G(y) - y \rVert_1\Big] \\
    \end{align}

    $\mathcal{L}_{GAN}$ is the generative adversarial loss from the original GAN paper.

    $\mathcal{L}_{cyc}$ is the cyclic loss, where we try to get $F(G(x))$ to be
    similar to $x$, and $G(F(y))$ to be similar to $y$.
    Basically, if the two generators (transformations) are applied in series,
    they should give back the original image.
    This is the main contribution of this paper.
    It trains the generators to generate an image of the other distribution
    that is similar to the original image.
    Without this loss $G(x)$ could generate anything that's from the distribution of $Y$.
    Now it needs to generate something from the distribution of $Y$
    that still has the properties of $x$, so that $F(G(x))$ can
    re-generate something like $x$.

    $\mathcal{L}_{identity}$ is the identity loss.
    It was used to encourage the mappings to preserve the color composition
    between the input and the output.

    To solve $G^{*}, F^{*}$,
    discriminators $D_X$ and $D_Y$ should **ascend** on the gradient,
    \begin{align}
    \nabla_{\theta_{D_X, D_Y}} \frac{1}{m} \sum_{i=1}^m
    &\Bigg[
    \log D_Y\Big(y^{(i)}\Big) \\
    &+ \log \Big(1 - D_Y\Big(G\Big(x^{(i)}\Big)\Big)\Big) \\
    &+ \log D_X\Big(x^{(i)}\Big) \\
    &+ \log\Big(1 - D_X\Big(F\Big(y^{(i)}\Big)\Big)\Big)
    \Bigg]
    \end{align}
    That is, descend on the *negative* log-likelihood loss.

    In order to stabilize the training, the negative log-likelihood objective
    was replaced by a least-squared loss -
    the least-squared error of the discriminator, labelling real images with 1
    and generated images with 0.
    So we want to descend on the gradient,
    \begin{align}
    \nabla_{\theta_{D_X, D_Y}} \frac{1}{m} \sum_{i=1}^m
    &\Bigg[
    \bigg(D_Y\Big(y^{(i)}\Big) - 1\bigg)^2 \\
    &+ D_Y\Big(G\Big(x^{(i)}\Big)\Big)^2 \\
    &+ \bigg(D_X\Big(x^{(i)}\Big) - 1\bigg)^2 \\
    &+ D_X\Big(F\Big(y^{(i)}\Big)\Big)^2
    \Bigg]
    \end{align}

    We use the least-squared loss for the generators as well.
    The generators should *descend* on the gradient,
    \begin{align}
    \nabla_{\theta_{F, G}} \frac{1}{m} \sum_{i=1}^m
    &\Bigg[
    \bigg(D_Y\Big(G\Big(x^{(i)}\Big)\Big) - 1\bigg)^2 \\
    &+ \bigg(D_X\Big(F\Big(y^{(i)}\Big)\Big) - 1\bigg)^2 \\
    &+ \mathcal{L}_{cyc}(G, F) + \mathcal{L}_{identity}(G, F)
    \Bigg]
    \end{align}

    We use `generator_xy` for $G$ and `generator_yx` for $F$.
    We use `discriminator_x` for $D_X$ and `discriminator_y` for $D_Y$.
""" # Replay buffers to keep generated samples gen_x_buffer = ReplayBuffer() gen_y_buffer = ReplayBuffer() # Loop through epochs for epoch in monit.loop(self.epochs): # Loop through the dataset for i, batch in monit.enum('Train', self.dataloader): # Move images to the device data_x, data_y = batch['x'].to(self.device), batch['y'].to( self.device) # true labels equal to $1$ true_labels = torch.ones(data_x.size(0), *self.discriminator_x.output_shape, device=self.device, requires_grad=False) # false labels equal to $0$ false_labels = torch.zeros(data_x.size(0), *self.discriminator_x.output_shape, device=self.device, requires_grad=False) # Train the generators. # This returns the generated images. gen_x, gen_y = self.optimize_generators( data_x, data_y, true_labels) # Train discriminators self.optimize_discriminator(data_x, data_y, gen_x_buffer.push_and_pop(gen_x), gen_y_buffer.push_and_pop(gen_y), true_labels, false_labels) # Save training statistics and increment the global step counter tracker.save() tracker.add_global_step(max(len(data_x), len(data_y))) # Save images at intervals batches_done = epoch * len(self.dataloader) + i if batches_done % self.sample_interval == 0: # Save models when sampling images experiment.save_checkpoint() # Sample images self.sample_images(batches_done) # Update learning rates self.generator_lr_scheduler.step() self.discriminator_lr_scheduler.step() # New line tracker.new_line()