def download_from_url(url, dst): """ kindly used from https://gist.github.com/wy193777/0e2a4932e81afc6aa4c8f7a2984f34e2 @param: url to download file @param: dst place to put the file """ file_size = int(requests.head(url).headers["Content-Length"]) if os.path.exists(dst): first_byte = os.path.getsize(dst) else: first_byte = 0 if first_byte >= file_size: return file_size header = {"Range": "bytes=%s-%s" % (first_byte, file_size)} pbar = tqdm( total=file_size, initial=first_byte, unit='B', unit_scale=True, desc=url.split('/')[-1]) req = requests.get(url, headers=header, stream=True) with(open(dst, 'ab')) as f: for chunk in req.iter_content(chunk_size=1024): if chunk: f.write(chunk) pbar.update(1024) pbar.close() return file_size
def reduce_noise( audio_clip, noise_clip, n_grad_freq=2, n_grad_time=4, n_fft=2048, win_length=2048, hop_length=512, n_std_thresh=1.5, prop_decrease=1.0, pad_clipping=True, use_tensorflow=False, verbose=False, ): """Remove noise from audio based upon a clip containing only noise Args: audio_clip (array): The first parameter. noise_clip (array): The second parameter. n_grad_freq (int): how many frequency channels to smooth over with the mask. n_grad_time (int): how many time channels to smooth over with the mask. n_fft (int): number audio of frames between STFT columns. win_length (int): Each frame of audio is windowed by `window()`. The window will be of length `win_length` and then padded with zeros to match `n_fft`.. hop_length (int):number audio of frames between STFT columns. n_std_thresh (int): how many standard deviations louder than the mean dB of the noise (at each frequency level) to be considered signal prop_decrease (float): To what extent should you decrease noise (1 = all, 0 = none) pad_clipping (bool): Pad the signals with zeros to ensure that the reconstructed data is equal length to the data use_tensorflow (bool): Use tensorflow as a backend for convolution and fft to speed up computation verbose (bool): Whether to plot the steps of the algorithm Returns: array: The recovered signal with noise subtracted """ # load tensorflow if you are using it as a backend if use_tensorflow: use_tensorflow = load_tensorflow(verbose) if verbose: pbar = tqdm(total=7) else: pbar = None update_pbar(pbar, "STFT on noise") # STFT over noise noise_stft = _stft(noise_clip, n_fft, hop_length, win_length, use_tensorflow=use_tensorflow) noise_stft_db = _amp_to_db(np.abs(noise_stft)) # convert to dB # Calculate statistics over noise update_pbar(pbar, "STFT on signal") mean_freq_noise = np.mean(noise_stft_db, axis=1) std_freq_noise = np.std(noise_stft_db, axis=1) noise_thresh = mean_freq_noise + std_freq_noise * n_std_thresh # STFT over signal update_pbar(pbar, "STFT on signal") # pad signal with zeros to avoid extra frames being clipped if desired if pad_clipping: nsamp = len(audio_clip) audio_clip = np.pad(audio_clip, [0, hop_length], mode="constant") sig_stft = _stft(audio_clip, n_fft, hop_length, win_length, use_tensorflow=use_tensorflow) # spectrogram of signal in dB sig_stft_db = _amp_to_db(np.abs(sig_stft)) update_pbar(pbar, "Generate mask") # calculate the threshold for each frequency/time bin db_thresh = np.repeat( np.reshape(noise_thresh, [1, len(mean_freq_noise)]), np.shape(sig_stft_db)[1], axis=0, ).T # mask if the signal is above the threshold sig_mask = sig_stft_db < db_thresh update_pbar(pbar, "Smooth mask") # Create a smoothing filter for the mask in time and frequency smoothing_filter = _smoothing_filter(n_grad_freq, n_grad_time) # convolve the mask with a smoothing filter sig_mask = convolve_gaussian(sig_mask, smoothing_filter, use_tensorflow) sig_mask = sig_mask * prop_decrease update_pbar(pbar, "Apply mask") # mask the signal sig_stft_amp = mask_signal(sig_stft, sig_mask) update_pbar(pbar, "Recover signal") # recover the signal recovered_signal = _istft(sig_stft_amp, n_fft, hop_length, win_length, use_tensorflow=use_tensorflow) # fix the recovered signal length if padding signal if pad_clipping: recovered_signal = librosa.util.fix_length(recovered_signal, nsamp) recovered_spec = _amp_to_db( np.abs( _stft( recovered_signal, n_fft, hop_length, win_length, use_tensorflow=use_tensorflow, ))) if verbose: plot_reduction_steps( noise_stft_db, mean_freq_noise, std_freq_noise, noise_thresh, 
smoothing_filter, sig_stft_db, sig_mask, recovered_spec, ) return recovered_signal
def train_simultaneous_decoder_objectives(encoder, decoder, train_dl, test_dl, Y, Y_test, Y_test_avg, epochs=config.num_multi_epochs): global NUM_VOXELS # encoder.eval() encoder.eval() encoder.trainable = False decoder.train() decoder.trainable = True print(decoder) Y = torch.from_numpy(Y).float() #.cuda() # Y = Y.reshape(-1, NUM_VOXELS, 1, 1) # turn fmri into 1 x NUMVOXELS grayscale image Y_test = torch.from_numpy(Y_test).float() Y_test_avg = torch.from_numpy(Y_test_avg).float() #.cuda() test_fmri_dl = make_test_fmri_dl(Y_test_avg) msecriterion = nn.MSELoss() # maecriterion = nn.L1Loss() # ssimcriterion = piq.SSIMLoss(data_range=1.)# # ssimcriterion = pytorch_ssim.SSIM() # perceptualcriterion = lpips.LPIPS(net='alex').cuda() # mdsicriterion = piqa.MDSI().cuda() coscriterion = nn.CosineSimilarity() # enc_optim = optim.AdamW(lr=0, params=encoder.parameters()) optimizer = optim.Adam( lr=1e-3, params=list(decoder.parameters()) # + list(encoder.parameters()) , weight_decay=1e-3) # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.8) epoch_dec_losses = [] epoch_decenc_losses = [] epoch_encdec_losses = [] imagenet = imagenet_dl() scaler = torch.cuda.amp.GradScaler(enabled=True) objectives = ["d"] * 80 + [ "ed" ] * 20 # ["d"] * 60 + ["de"] * 10 + ["ed"] * 30 + ["gan"] * 0 for epoch in tqdm(range(epochs)): decoder.trainable = True decoder.train() dec_losses = [] decenc_losses = [] encdec_losses = [] for i, batch in enumerate(train_dl): # TODO: # - use test set of MRIs in decenc # - transformer decoder? # - imagenet val set in encdec inputs, mris, idxs = batch batch_size = len(inputs) inputs = inputs.permute(0, 3, 1, 2).float().cuda() # Y_batch = Y[idxs].cuda() # if we go to next batch in train_dl, but then pick random objective of training decenc on testset statistics fmri, we're going to be biased against training on the trainset mri->mri mapping.. Y_batch = mris.float().cuda() # not sure why it is so memory intensive to do all 3.. doing a random choice of any objective = random.choice(objectives) if epoch > 0 else "d" # enc_optim.zero_grad() # dec: # D: fMRI -> image if objective == "d": with torch.cuda.amp.autocast(): dec_outputs = decoder( Y_batch).float().cuda() # [b, c, h, w] # print(dec_outputs.shape, inputs.shape) dec_loss = msecriterion(dec_outputs, inputs) # dec_loss = mdsicriterion(dec_outputs, inputs.permute(0, 3, 1, 2)) # dec_loss += maecriterion(dec_outputs, inputs)# + ssimcriterion(dec_outputs, inputs.permute(0, 3, 1, 2)) # dec_loss -= ssimcriterion(dec_outputs, inputs) # dec_loss += perceptualcriterion(dec_outputs, inputs) # perceptualloss = perceptualcriterion.forward(dec_outputs, inputs, normalize=True).cuda() # dec_loss += 0.01 * torch.sum(perceptualloss) # msecriterion(dec_outputs.permute(0, 2, 3, 1), inputs) + -ssimcriterion(dec_outputs.permute(0, 2, 3, 1), inputs) \ # loss = dec_loss dec_losses.append(dec_loss.item()) # print("d", dec_outputs.permute(0, 2, 3, 1).shape, inputs.shape) # decenc: # E . 
D: mri -> mri elif objective == "de": fmri_set = random.choice(["trainset", "testset"]) if fmri_set == "testset": print(">testset fmri") del Y_batch Y_batch = next(iter(test_fmri_dl)).float().cuda() with torch.cuda.amp.autocast(): dec_outputs = decoder( Y_batch).float().cuda() # [b, c, h, w] decenc_outputs = encoder( dec_outputs) #.reshape(batch_size, NUM_VOXELS, 1, 1) decenc_loss = msecriterion(decenc_outputs, Y_batch) decenc_loss += ( 1 - torch.mean(coscriterion(decenc_outputs, Y_batch))) loss = decenc_loss decenc_losses.append(decenc_loss.item()) # print("de", decenc_outputs.shape, Y_batch.shape) # encdec: # D. E: img -> img elif objective == "ed": # enc: b h w c -> b c h w -> # dec then b c h w -> b h w c img_src = random.choice(["trainset", "trainset", "imagenet"]) if img_src == "imagenet": print(">imagenet batch") del inputs inputs = next(iter(imagenet)).float().cuda() with torch.cuda.amp.autocast(): encdec_outputs = decoder( encoder( inputs) #.reshape(batch_size, NUM_VOXELS, 1, 1) ) encdec_loss = msecriterion(encdec_outputs, inputs) # encdec_loss += perceptualcriterion(dec_outputs, inputs) # encdec_loss = mdsicriterion(encdec_outputs, inputs.permute(0, 3, 1, 2)) # encdec_loss += maecriterion(encdec_outputs, inputs) # encdec_loss -= ssimcriterion(encdec_outputs, inputs) # encdec_loss = contentloss(encdec_outputs, inputs.permute(0, 3, 1, 2)) # msecriterion(encdec_outputs, inputs) -ssimcriterion(encdec_outputs, inputs) loss = encdec_loss encdec_losses.append(encdec_loss.item()) # print("ed", encdec_outputs.shape, inputs.shape) elif objective == "gan": pass # loss = torch.sum(dec_loss) + torch.sum(decenc_loss) + torch.sum(encdec_loss) # scaled_grad_params = torch.autograd.grad(outputs=scaler.scale(loss), # inputs=decoder.parameters(), # create_graph=True # ) # inv_scale = 1./scaler.get_scale() # grad_params = [p * inv_scale for p in scaled_grad_params] # with torch.cuda.amp.autocast(): # grad_norm = 0 # for grad in grad_params: # grad_norm += grad.pow(2).sum() # grad_norm = grad_norm.sqrt() # loss = loss + grad_norm scaler.scale(loss).backward() scaler.step(optimizer) scaler.update() # scheduler.step() optimizer.zero_grad() # enc_optim.step() print( f"epoch {epoch} mri->img: {np.mean(dec_losses)} mri->mri: {np.mean(decenc_losses)} img->img: {np.mean(encdec_losses)}" ) epoch_dec_losses.append(np.mean(dec_losses)) epoch_decenc_losses.append(np.mean(decenc_losses)) epoch_encdec_losses.append(np.mean(encdec_losses)) # if epoch % 5 == 0: # with torch.no_grad(): # eval_decoder(decoder, test_dl, X_test, Y_test_avg, avg=True) if epoch % 20 == 0: print("running through whole un-averaged testset") with torch.no_grad(): # eval_decoder(decoder, test_dl, X_test, Y_test_avg, Y_test=Y, avg=False) decode_test_set(test_dl, Y_test, X, decoder) if epoch % 20 == 0: print("dumping trainset results") with torch.no_grad(): decode_training_set(train_dl, Y, X, decoder) import matplotlib.pyplot as plt # plt.plot(range(len(epoch_dec_losses)), epoch_dec_losses, label='dec') # plt.plot(range(len(epoch_decenc_losses)), epoch_decenc_losses, label='decenc') # plt.plot(range(len(epoch_encdec_losses)), epoch_encdec_losses, label='encdec') # plt.legend() # plt.show() return decoder
from tqdm.autonotebook import tqdm import time x = range(10000) tk0 = tqdm(x, total=len(x)) for i in tk0: time.sleep(0.1)
def fit(self, X, lazy=False, collector=False): """ Build the SubCMedians model associated to dataset $X$ Parameters ---------- X : numpy.ndarray Data set to be clustered, rows represent instances (points) and columns represent features (dimensions) lazy: bool, default=False If true only tries to update the model iif the SAE increased when the datasample was updated, otherwise the optimization iteration is permormed at each sample update collector: bool, default=False If true all the models, and conserved modifications are kept. Returns ------- subcmedians Fitted subcmedians instance """ X = np.asarray(X) if self.random_state is not None: np.random.seed(self.random_state) industrius = (not lazy) if X.shape[0] < self.H: self.H = X.shape[0] - 1 self.S = X[:self.H, :].copy() self._sae_history = [] self._nb_centers_history = [] self._genome_size_history = [] if collector: self.subspaces_history = [] self.cluster_centers_history = [] self.changes_accepted_history = [] self.changes_history = [] self.gain_sae_history = [] i = self.H h = 0 self._model_candidate(self.S[h]) sae = self.sae_score(self.S) for t in tqdm(range(self.nb_iter)): # update dataset and SAE ae_old_point = self.sae_score(self.S[h, :]) point = X[i, :] self.S[h, :] = point ae_new_point = self.sae_score(self.S[h, :]) sae = sae - ae_old_point + ae_new_point # generate candidate if industrius or ae_old_point < ae_new_point: self._model_candidate(point) if collector: self.changes_history.append([ self.model._candidate_deletion, self.model._candidate_insertion ]) sae_ = self.sae_score(self.S) gain = sae - sae_ if gain >= 0: sae = sae_ self.model.apply_changes() if collector: self.changes_accepted_history.append(1) else: self.model.reverse_changes() if collector: self.changes_accepted_history.append(0) self._sae_history.append(sae) self._nb_centers_history.append(self.model.geno.nb_centers) self._genome_size_history.append(self.model.geno.G) if collector: self.subspaces_history.append(self.model.geno.to_pandas()) self.cluster_centers_history.append( self.model.pheno.to_pandas()) self.gain_sae_history.append(gain) h = (h + 1) % self.H i = (i + 1) % X.shape[0] self.cluster_centers_ = self.model.get_cluster_centers() self.subspaces_ = self.model.get_cluster_subspaces() self.sae_ = sae return (self)
def range_test( self, train_loader, val_loader=None, start_lr=None, end_lr=10, num_iter=100, step_mode="exp", smooth_f=0.05, diverge_th=5, accumulation_steps=1, ): """Performs the learning rate range test. Arguments: train_loader (torch.utils.data.DataLoader): the training set data laoder. val_loader (torch.utils.data.DataLoader, optional): if `None` the range test will only use the training loss. When given a data loader, the model is evaluated after each iteration on that dataset and the evaluation loss is used. Note that in this mode the test takes significantly longer but generally produces more precise results. Default: None. start_lr (float, optional): the starting learning rate for the range test. Default: None (uses the learning rate from the optimizer). end_lr (float, optional): the maximum learning rate to test. Default: 10. num_iter (int, optional): the number of iterations over which the test occurs. Default: 100. step_mode (str, optional): one of the available learning rate policies, linear or exponential ("linear", "exp"). Default: "exp". smooth_f (float, optional): the loss smoothing factor within the [0, 1[ interval. Disabled if set to 0, otherwise the loss is smoothed using exponential smoothing. Default: 0.05. diverge_th (int, optional): the test is stopped when the loss surpasses the threshold: diverge_th * best_loss. Default: 5. accumulation_steps (int, optional): steps for gradient accumulation. If it is 1, gradients are not accumulated. Default: 1. Example (fastai approach): >>> lr_finder = LRFinder(net, optimizer, criterion, device="cuda") >>> lr_finder.range_test(dataloader, end_lr=100, num_iter=100) Example (Leslie Smith's approach): >>> lr_finder = LRFinder(net, optimizer, criterion, device="cuda") >>> lr_finder.range_test(trainloader, val_loader=val_loader, end_lr=1, num_iter=100, step_mode="linear") Gradient accumulation is supported; example: >>> train_data = ... 
# prepared dataset >>> desired_bs, real_bs = 32, 4 # batch size >>> accumulation_steps = desired_bs // real_bs # required steps for accumulation >>> dataloader = torch.utils.data.DataLoader(train_data, batch_size=real_bs, shuffle=True) >>> acc_lr_finder = LRFinder(net, optimizer, criterion, device="cuda") >>> acc_lr_finder.range_test(dataloader, end_lr=10, num_iter=100, accumulation_steps=accumulation_steps) Reference: [Training Neural Nets on Larger Batches: Practical Tips for 1-GPU, Multi-GPU & Distributed setups]( https://medium.com/huggingface/ec88c3e51255) [thomwolf/gradient_accumulation](https://gist.github.com/thomwolf/ac7a7da6b1888c2eeac8ac8b9b05d3d3) """ # Reset test results self.history = {"lr": [], "loss": []} self.best_loss = None # Move the model to the proper device self.model.to(self.device) # Check if the optimizer is already attached to a scheduler self._check_for_scheduler() # Set the starting learning rate if start_lr: self._set_learning_rate(start_lr) # Initialize the proper learning rate policy if step_mode.lower() == "exp": lr_schedule = ExponentialLR(self.optimizer, end_lr, num_iter) elif step_mode.lower() == "linear": lr_schedule = LinearLR(self.optimizer, end_lr, num_iter) else: raise ValueError( "expected one of (exp, linear), got {}".format(step_mode)) if smooth_f < 0 or smooth_f >= 1: raise ValueError("smooth_f is outside the range [0, 1[") # Create an iterator to get data batch by batch iter_wrapper = DataLoaderIterWrapper(train_loader) for iteration in tqdm(range(num_iter)): # Train on batch and retrieve loss loss, accu = self._train_batch(iter_wrapper, accumulation_steps) if val_loader: loss = self._validate(val_loader) # Update the learning rate lr_schedule.step() self.history["lr"].append(lr_schedule.get_lr()[0]) self.accuracy["lr"].append(lr_schedule.get_lr()[0]) # Track the best loss and smooth it if smooth_f is specified if iteration == 0: self.best_loss = loss self.best_Accu = accu self.max_lr = lr_schedule.get_lr()[0] else: if smooth_f > 0: loss = smooth_f * loss + ( 1 - smooth_f) * self.history["loss"][-1] if loss < self.best_loss: self.best_loss = loss if accu > self.best_Accu: self.best_Accu = accu self.max_lr = lr_schedule.get_lr()[0] # Check if the loss has diverged; if it has, stop the test self.history["loss"].append(loss) self.accuracy["Accuracy"].append(accu) #if loss > diverge_th * self.best_loss: # print("Stopping early, the loss has diverged") # break print( "Learning rate search finished. See the graph with {finder_name}.plot()" ) print("Max Accuracy = " + str(self.best_Accu) + " at LR = " + str(self.max_lr))
def evaluate(self, eval_data, load_best_model=True, model_file=None, show_progress=False): r"""Evaluate the model based on the eval data. Args: eval_data (DataLoader): the eval data load_best_model (bool, optional): whether load the best model in the training process, default: True. It should be set True, if users want to test the model after training. model_file (str, optional): the saved model file, default: None. If users want to test the previously trained model file, they can set this parameter. show_progress (bool): Show the progress of evaluate epoch. Defaults to ``False``. Returns: dict: eval result, key is the eval metric and value in the corresponding metric value. """ if not eval_data: return if load_best_model: if model_file: checkpoint_file = model_file else: checkpoint_file = self.saved_model_file checkpoint = torch.load(checkpoint_file) self.model.load_state_dict(checkpoint['state_dict']) message_output = 'Loading model structure and parameters from {}'.format( checkpoint_file) self.logger.info(message_output) self.model.eval() if eval_data.dl_type == DataLoaderType.FULL: if self.item_tensor is None: self.item_tensor = eval_data.get_item_feature().to( self.device).repeat(eval_data.step) self.tot_item_num = eval_data.dataset.item_num batch_matrix_list = [] iter_data = (tqdm( enumerate(eval_data), total=len(eval_data), desc=set_color(f"Evaluate ", 'pink'), ) if show_progress else enumerate(eval_data)) for batch_idx, batched_data in iter_data: if eval_data.dl_type == DataLoaderType.FULL: interaction, scores = self._full_sort_batch_eval(batched_data) else: interaction = batched_data batch_size = interaction.length if batch_size <= self.test_batch_size: scores = self.model.predict(interaction.to(self.device)) else: scores = self._spilt_predict(interaction, batch_size) batch_matrix = self.evaluator.collect(interaction, scores) batch_matrix_list.append(batch_matrix) result = self.evaluator.evaluate(batch_matrix_list, eval_data) return result
text = ("funny comedy music laugh humor song songs jokes musical hilarious") doc = nlp(text) for token1, token2 in combinations(doc, 2): print( f"similarity between {token1} and {token2} is {token1.similarity(token2)}" ) # %% import pandas as pd from gensim.models import Word2Vec from tqdm.autonotebook import tqdm data = pd.read_csv("train.csv") sentences = [] for review in tqdm(data["review"]): doc = nlp(review, disable=["tagger", "ner"]) for sent in doc.sents: sentences.append([t.text for t in sent]) # train a Word2vec model model = Word2Vec(sentences) # %% for token1, token2 in combinations(text.split(), 2): print( f"similarity between {token1} and {token2} is {model.wv.similarity(token1, token2)}" ) # %%
final_output = [] # Instantiate TweetDataset with the test data test_dataset = TweetDataset(tweet=df_test.text.values, sentiment=df_test.sentiment.values, selected_text=df_test.selected_text.values) # Instantiate DataLoader with `test_dataset` data_loader = torch.utils.data.DataLoader(test_dataset, shuffle=False, batch_size=VALID_BATCH_SIZE, num_workers=1) # Turn of gradient calculations with torch.no_grad(): tk0 = tqdm(data_loader, total=len(data_loader), ncols=80) # Predict the span containing the sentiment for each batch for bi, d in enumerate(tk0): ids = d["ids"] token_type_ids = d["token_type_ids"] mask = d["mask"] sentiment = d["sentiment"] orig_selected = d["orig_selected"] orig_tweet = d["orig_tweet"] targets_start = d["targets_start"] targets_end = d["targets_end"] offsets = d["offsets"].numpy() ids = ids.to(device, dtype=torch.long) token_type_ids = token_type_ids.to(device, dtype=torch.long) mask = mask.to(device, dtype=torch.long)
def generate_tiles_dataset(self, output_dir=None, save_format="jpg", remove_first=True): """ Actually generates training images from the dataset.sampled_tiles (= regions of interest) The filestructure is compatible with keras.ImageDataGenerator.flow_from_directory() method For more information on how to parse this, check this script: https://gist.github.com/fchollet/0830affa1f7f19fd47b06d4cf89ed44d In summary, this is our directory structure: ```markdown output_dir/ aircrafts/ ac001.jpg ac002.jpg ... background/ bg001.jpg bg002.jpg ... ``` Args: output_dir(str): the output path save_format: "jpg" the image format remove_first(bool): erase output dir first? Returns: """ LOGGER.info( "Generating a dataset of tiles at location {}".format(output_dir)) for label in self.found_labels: if remove_first: shutil.rmtree(os.path.join(output_dir, label)) if not os.path.exists(os.path.join(output_dir, label)): os.makedirs(os.path.join(output_dir, label)) def _generate_tiles(item, tiles): image = item.image tiles = list_utils.filter_tiles_by_item(tiles, item.key) for tile in tiles: tile_data = tile.get_data(image) tile_label = tile.label tile_basename = "{}_{}.{}".format(item.key, tile.key, save_format) io.imsave(os.path.join(output_dir, tile_label, tile_basename), tile_data) items = self.items sampled_tiles = self.sampled_tiles LOGGER.info("Dumping tiles to {}".format(output_dir)) for item in tqdm(items, desc="Saving tiles to {}".format(output_dir)): _generate_tiles(item, sampled_tiles)
def range_test( self, train_loader, val_loader=None, end_lr=10, num_iter=100, step_mode="exp", smooth_f=0.05, diverge_th=5, ): """Performs the learning rate range test. Arguments: train_loader (torch.utils.data.DataLoader): the training set data laoder. val_loader (torch.utils.data.DataLoader, optional): if `None` the range test will only use the training loss. When given a data loader, the model is evaluated after each iteration on that dataset and the evaluation loss is used. Note that in this mode the test takes significantly longer but generally produces more precise results. Default: None. end_lr (float, optional): the maximum learning rate to test. Default: 10. num_iter (int, optional): the number of iterations over which the test occurs. Default: 100. step_mode (str, optional): one of the available learning rate policies, linear or exponential ("linear", "exp"). Default: "exp". smooth_f (float, optional): the loss smoothing factor within the [0, 1[ interval. Disabled if set to 0, otherwise the loss is smoothed using exponential smoothing. Default: 0.05. diverge_th (int, optional): the test is stopped when the loss surpasses the threshold: diverge_th * best_loss. Default: 5. """ # Reset test results self.history = {"lr": [], "loss": []} self.best_loss = None # Move the model to the proper device self.model.to(self.device) # Initialize the proper learning rate policy if step_mode.lower() == "exp": lr_schedule = ExponentialLR(self.optimizer, end_lr, num_iter) elif step_mode.lower() == "linear": lr_schedule = LinearLR(self.optimizer, end_lr, num_iter) else: raise ValueError( "expected one of (exp, linear), got {}".format(step_mode)) if smooth_f < 0 or smooth_f >= 1: raise ValueError("smooth_f is outside the range [0, 1[") # Create an iterator to get data batch by batch iterator = iter(train_loader) for iteration in tqdm(range(num_iter)): # Get a new set of inputs and labels try: inputs, labels = next(iterator) except StopIteration: iterator = iter(train_loader) inputs, labels = next(iterator) # Train on batch and retrieve loss loss = self._train_batch(inputs, labels) if val_loader: loss = self._validate(val_loader) # Update the learning rate lr_schedule.step() self.history["lr"].append(lr_schedule.get_lr()[0]) # Track the best loss and smooth it if smooth_f is specified if iteration == 0: self.best_loss = loss else: if smooth_f > 0: loss = smooth_f * loss + ( 1 - smooth_f) * self.history["loss"][-1] if loss < self.best_loss: self.best_loss = loss # Check if the loss has diverged; if it has, stop the test self.history["loss"].append(loss) if loss > diverge_th * self.best_loss: print("Stopping early, the loss has diverged") break print( "Learning rate search finished. See the graph with {finder_name}.plot()" )
def _parallel_analysis_component_selection(self, timeseries, L, K, rank, singular_values, iterations=100): ''' Performs parallel analysis to help select the appropriate number of MSSA components to keep. The algorithm follows these steps: 1. Calculate the eigenvalues via SVD/PCA on your real dataset. 2. For a given number of iterations: 3. Construct a random noise matrix the same shape as your real data. 4. Perform decomposition of the random noise data. 5. Calculate the eigenvalues for the noise data and track them per iteration. 6. Calculate the percentile at a user-specified threshold of the noise eigenvalues at each position. 7. Select only the number of components in the real data whose eigenvalues exceed those at the specified percentile of the noise eigenvalues. ''' def _bootstrap_eigenvalues(ts_std, ts_shape, L, K, rank): # create random normal differences with equivalent standard deviations ts_rnorm = np.random.normal( np.zeros(ts_shape[1]), ts_std, size=ts_shape ) # create noise trajectory matrix rnorm_trajectory_matrix = ts_matrix_to_trajectory_matrix( ts_rnorm, L, K ) # decompose the noise trajectory matrix U, s, V, rank = decompose_trajectory_matrix( rnorm_trajectory_matrix, rank, svd_method=self.svd_method ) # return the eigenvalues return s ** 2 # calculate real eigenvalues eigenvalues = singular_values ** 2 # calculate standard deviations column-wise ts_std = np.std(timeseries, axis=0) # bootstrap the eigenvalues noise_eigenvalues = [ _bootstrap_eigenvalues( ts_std, timeseries.shape, L, K, rank ) for i in tqdm(range(iterations), disable=(not self.verbose)) ] noise_eigenvalues = np.concatenate(noise_eigenvalues, axis=0) # calculate the 95th percentile of the noise eigenvalues eig_pctl = np.percentile(noise_eigenvalues, 95, axis=0) # find the first index where the noise eigenvalue 95th percentile is >= real adjusted_rank = np.where(eig_pctl > eigenvalues)[0][0] return adjusted_rank
def transform(self, df): return pd.Series({ idx: self.transform_row_(row) for idx, row in tqdm(df.iterrows()) })
def start_training(self): if self.system_dict["params"]["num_gpus"] == 0: os.environ['CUDA_VISIBLE_DEVICES'] = '-1' if torch.cuda.is_available(): torch.cuda.manual_seed(42) else: torch.manual_seed(42) self.system_dict["params"]["saved_path"] = self.system_dict["params"]["saved_path"] + "/" + self.system_dict["params"]["project_name"] + "/"; self.system_dict["params"]["log_path"] = self.system_dict["params"]["log_path"] + "/" + self.system_dict["params"]["project_name"] + "/tensorboard/"; os.makedirs(self.system_dict["params"]["saved_path"], exist_ok=True) os.makedirs(self.system_dict["params"]["log_path"], exist_ok=True) training_params = {'batch_size': self.system_dict["params"]["batch_size"], 'shuffle': True, 'drop_last': True, 'collate_fn': collater, 'num_workers': self.system_dict["params"]["num_workers"]} val_params = {'batch_size': self.system_dict["params"]["batch_size"], 'shuffle': False, 'drop_last': True, 'collate_fn': collater, 'num_workers': self.system_dict["params"]["num_workers"]} input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536] training_set = CocoDataset(self.system_dict["dataset"]["train"]["root_dir"], self.system_dict["dataset"]["train"]["coco_dir"], self.system_dict["dataset"]["train"]["img_dir"], set_dir=self.system_dict["dataset"]["train"]["set_dir"], transform=transforms.Compose([Normalizer(mean=self.system_dict["params"]["mean"], std=self.system_dict["params"]["std"]), Augmenter(), Resizer(input_sizes[self.system_dict["params"]["compound_coef"]])])) training_generator = DataLoader(training_set, **training_params) if(self.system_dict["dataset"]["val"]["status"]): val_set = CocoDataset(self.system_dict["dataset"]["val"]["root_dir"], self.system_dict["dataset"]["val"]["coco_dir"], self.system_dict["dataset"]["val"]["img_dir"], set_dir=self.system_dict["dataset"]["val"]["set_dir"], transform=transforms.Compose([Normalizer(self.system_dict["params"]["mean"], self.system_dict["params"]["std"]), Resizer(input_sizes[self.system_dict["params"]["compound_coef"]])])) val_generator = DataLoader(val_set, **val_params) print(""); print(""); model = EfficientDetBackbone(num_classes=len(self.system_dict["params"]["obj_list"]), compound_coef=self.system_dict["params"]["compound_coef"], ratios=eval(self.system_dict["params"]["anchors_ratios"]), scales=eval(self.system_dict["params"]["anchors_scales"])); os.makedirs("pretrained_weights", exist_ok=True); if(self.system_dict["params"]["compound_coef"] == 0): if(not os.path.isfile(self.system_dict["params"]["load_weights"])): print("Downloading weights"); cmd = "wget https://github.com/zylo117/Yet-Another-Efficient-Pytorch/releases/download/1.0/efficientdet-d0.pth -O " + \ self.system_dict["params"]["load_weights"]; os.system(cmd); elif(self.system_dict["params"]["compound_coef"] == 1): if(not os.path.isfile(self.system_dict["params"]["load_weights"])): print("Downloading weights"); cmd = "wget https://github.com/zylo117/Yet-Another-Efficient-Pytorch/releases/download/1.0/efficientdet-d1.pth -O " + \ self.system_dict["params"]["load_weights"] os.system(cmd); elif(self.system_dict["params"]["compound_coef"] == 2): if(not os.path.isfile(self.system_dict["params"]["load_weights"])): print("Downloading weights"); cmd = "wget https://github.com/zylo117/Yet-Another-Efficient-Pytorch/releases/download/1.0/efficientdet-d2.pth -O " + \ self.system_dict["params"]["load_weights"] os.system(cmd); elif(self.system_dict["params"]["compound_coef"] == 3): if(not os.path.isfile(self.system_dict["params"]["load_weights"])): print("Downloading 
weights"); cmd = "wget https://github.com/zylo117/Yet-Another-Efficient-Pytorch/releases/download/1.0/efficientdet-d3.pth -O " + \ self.system_dict["params"]["load_weights"] os.system(cmd); elif(self.system_dict["params"]["compound_coef"] == 4): if(not os.path.isfile(self.system_dict["params"]["load_weights"])): print("Downloading weights"); cmd = "wget https://github.com/zylo117/Yet-Another-Efficient-Pytorch/releases/download/1.0/efficientdet-d4.pth -O " + \ self.system_dict["params"]["load_weights"] os.system(cmd); elif(self.system_dict["params"]["compound_coef"] == 5): if(not os.path.isfile(self.system_dict["params"]["load_weights"])): print("Downloading weights"); cmd = "wget https://github.com/zylo117/Yet-Another-Efficient-Pytorch/releases/download/1.0/efficientdet-d5.pth -O " + \ self.system_dict["params"]["load_weights"] os.system(cmd); elif(self.system_dict["params"]["compound_coef"] == 6): if(not os.path.isfile(self.system_dict["params"]["load_weights"])): print("Downloading weights"); cmd = "wget https://github.com/zylo117/Yet-Another-Efficient-Pytorch/releases/download/1.0/efficientdet-d6.pth -O " + \ self.system_dict["params"]["load_weights"] os.system(cmd); elif(self.system_dict["params"]["compound_coef"] == 7): if(not os.path.isfile(self.system_dict["params"]["load_weights"])): print("Downloading weights"); cmd = "wget https://github.com/zylo117/Yet-Another-Efficient-Pytorch/releases/download/1.0/efficientdet-d7.pth -O " + \ self.system_dict["params"]["load_weights"] os.system(cmd); # load last weights if self.system_dict["params"]["load_weights"] is not None: if self.system_dict["params"]["load_weights"].endswith('.pth'): weights_path = self.system_dict["params"]["load_weights"] else: weights_path = get_last_weights(self.system_dict["params"]["saved_path"]) try: last_step = int(os.path.basename(weights_path).split('_')[-1].split('.')[0]) except: last_step = 0 try: ret = model.load_state_dict(torch.load(weights_path), strict=False) except RuntimeError as e: print(f'[Warning] Ignoring {e}') print( '[Warning] Don\'t panic if you see this, this might be because you load a pretrained weights with different number of classes. 
The rest of the weights should be loaded already.') print(f'[Info] loaded weights: {os.path.basename(weights_path)}, resuming checkpoint from step: {last_step}') else: last_step = 0 print('[Info] initializing weights...') init_weights(model) print(""); print(""); # freeze backbone if train head_only if self.system_dict["params"]["head_only"]: def freeze_backbone(m): classname = m.__class__.__name__ for ntl in ['EfficientNet', 'BiFPN']: if ntl in classname: for param in m.parameters(): param.requires_grad = False model.apply(freeze_backbone) print('[Info] freezed backbone') print(""); print(""); if self.system_dict["params"]["num_gpus"] > 1 and self.system_dict["params"]["batch_size"] // self.system_dict["params"]["num_gpus"] < 4: model.apply(replace_w_sync_bn) use_sync_bn = True else: use_sync_bn = False writer = SummaryWriter(self.system_dict["params"]["log_path"] + f'/{datetime.datetime.now().strftime("%Y%m%d-%H%M%S")}/') model = ModelWithLoss(model, debug=self.system_dict["params"]["debug"]) if self.system_dict["params"]["num_gpus"] > 0: model = model.cuda() if self.system_dict["params"]["num_gpus"] > 1: model = CustomDataParallel(model, self.system_dict["params"]["num_gpus"]) if use_sync_bn: patch_replication_callback(model) if self.system_dict["params"]["optim"] == 'adamw': optimizer = torch.optim.AdamW(model.parameters(), self.system_dict["params"]["lr"]) else: optimizer = torch.optim.SGD(model.parameters(), self.system_dict["params"]["lr"], momentum=0.9, nesterov=True) scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, verbose=True) epoch = 0 best_loss = 1e5 best_epoch = 0 step = max(0, last_step) model.train() num_iter_per_epoch = len(training_generator) try: for epoch in range(self.system_dict["params"]["num_epochs"]): last_epoch = step // num_iter_per_epoch if epoch < last_epoch: continue epoch_loss = [] progress_bar = tqdm(training_generator) for iter, data in enumerate(progress_bar): if iter < step - last_epoch * num_iter_per_epoch: progress_bar.update() continue try: imgs = data['img'] annot = data['annot'] if self.system_dict["params"]["num_gpus"] == 1: # if only one gpu, just send it to cuda:0 # elif multiple gpus, send it to multiple gpus in CustomDataParallel, not here imgs = imgs.cuda() annot = annot.cuda() optimizer.zero_grad() cls_loss, reg_loss = model(imgs, annot, obj_list=self.system_dict["params"]["obj_list"]) cls_loss = cls_loss.mean() reg_loss = reg_loss.mean() loss = cls_loss + reg_loss if loss == 0 or not torch.isfinite(loss): continue loss.backward() # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1) optimizer.step() epoch_loss.append(float(loss)) progress_bar.set_description( 'Step: {}. Epoch: {}/{}. Iteration: {}/{}. Cls loss: {:.5f}. Reg loss: {:.5f}. 
Total loss: {:.5f}'.format( step, epoch, self.system_dict["params"]["num_epochs"], iter + 1, num_iter_per_epoch, cls_loss.item(), reg_loss.item(), loss.item())) writer.add_scalars('Loss', {'train': loss}, step) writer.add_scalars('Regression_loss', {'train': reg_loss}, step) writer.add_scalars('Classfication_loss', {'train': cls_loss}, step) # log learning_rate current_lr = optimizer.param_groups[0]['lr'] writer.add_scalar('learning_rate', current_lr, step) step += 1 if step % self.system_dict["params"]["save_interval"] == 0 and step > 0: self.save_checkpoint(model, f'efficientdet-d{self.system_dict["params"]["compound_coef"]}_trained.pth') #print('checkpoint...') except Exception as e: print('[Error]', traceback.format_exc()) print(e) continue scheduler.step(np.mean(epoch_loss)) if epoch % self.system_dict["params"]["val_interval"] == 0 and self.system_dict["dataset"]["val"]["status"]: print("Running validation"); model.eval() loss_regression_ls = [] loss_classification_ls = [] for iter, data in enumerate(val_generator): with torch.no_grad(): imgs = data['img'] annot = data['annot'] if self.system_dict["params"]["num_gpus"] == 1: imgs = imgs.cuda() annot = annot.cuda() cls_loss, reg_loss = model(imgs, annot, obj_list=self.system_dict["params"]["obj_list"]) cls_loss = cls_loss.mean() reg_loss = reg_loss.mean() loss = cls_loss + reg_loss if loss == 0 or not torch.isfinite(loss): continue loss_classification_ls.append(cls_loss.item()) loss_regression_ls.append(reg_loss.item()) cls_loss = np.mean(loss_classification_ls) reg_loss = np.mean(loss_regression_ls) loss = cls_loss + reg_loss print( 'Val. Epoch: {}/{}. Classification loss: {:1.5f}. Regression loss: {:1.5f}. Total loss: {:1.5f}'.format( epoch, self.system_dict["params"]["num_epochs"], cls_loss, reg_loss, loss)) writer.add_scalars('Loss', {'val': loss}, step) writer.add_scalars('Regression_loss', {'val': reg_loss}, step) writer.add_scalars('Classfication_loss', {'val': cls_loss}, step) if loss + self.system_dict["params"]["es_min_delta"] < best_loss: best_loss = loss best_epoch = epoch self.save_checkpoint(model, f'efficientdet-d{self.system_dict["params"]["compound_coef"]}_trained.pth') model.train() # Early stopping if epoch - best_epoch > self.system_dict["params"]["es_patience"] > 0: print('[Info] Stop training at epoch {}. The lowest loss achieved is {}'.format(epoch, best_loss)) break except KeyboardInterrupt: self.save_checkpoint(model, f'efficientdet-d{self.system_dict["params"]["compound_coef"]}_trained.pth') writer.close() writer.close() print(""); print(""); print("Training complete");
def train(model, train_dataloader, epochs, lr, steps_til_summary, epochs_til_checkpoint, model_dir, loss_fn, summary_fn=None, val_dataloader=None, double_precision=False, clip_grad=False, use_lbfgs=False, loss_schedules=None, validation_fn=None, start_epoch=0): optim = torch.optim.Adam(lr=lr, params=model.parameters()) # copy settings from Raissi et al. (2019) and here # https://github.com/maziarraissi/PINNs if use_lbfgs: optim = torch.optim.LBFGS(lr=lr, params=model.parameters(), max_iter=50000, max_eval=50000, history_size=50, line_search_fn='strong_wolfe') # Load the checkpoint if required if start_epoch > 0: # Load the model and start training from that point onwards model_path = os.path.join(model_dir, 'checkpoints', 'model_epoch_%04d.pth' % start_epoch) checkpoint = torch.load(model_path) model.load_state_dict(checkpoint['model']) model.train() optim.load_state_dict(checkpoint['optimizer']) optim.param_groups[0]['lr'] = lr assert (start_epoch == checkpoint['epoch']) else: # Start training from scratch if os.path.exists(model_dir): val = input("The model directory %s exists. Overwrite? (y/n)" % model_dir) if val == 'y': shutil.rmtree(model_dir) os.makedirs(model_dir) summaries_dir = os.path.join(model_dir, 'summaries') utils.cond_mkdir(summaries_dir) checkpoints_dir = os.path.join(model_dir, 'checkpoints') utils.cond_mkdir(checkpoints_dir) writer = SummaryWriter(summaries_dir) total_steps = 0 with tqdm(total=len(train_dataloader) * epochs) as pbar: train_losses = [] for epoch in range(start_epoch, epochs): if not epoch % epochs_til_checkpoint and epoch: # Saving the optimizer state is important to produce consistent results checkpoint = { 'epoch': epoch, 'model': model.state_dict(), 'optimizer': optim.state_dict() } torch.save( checkpoint, os.path.join(checkpoints_dir, 'model_epoch_%04d.pth' % epoch)) # torch.save(model.state_dict(), # os.path.join(checkpoints_dir, 'model_epoch_%04d.pth' % epoch)) np.savetxt( os.path.join(checkpoints_dir, 'train_losses_epoch_%04d.txt' % epoch), np.array(train_losses)) if validation_fn is not None: validation_fn(model, checkpoints_dir, epoch) for step, (model_input, gt) in enumerate(train_dataloader): start_time = time.time() if torch.cuda.is_available(): model_input = { key: value.cuda() for key, value in model_input.items() } gt = {key: value.cuda() for key, value in gt.items()} else: model_input = { key: value.cpu() for key, value in model_input.items() } gt = {key: value.cpu() for key, value in gt.items()} if double_precision: model_input = { key: value.double() for key, value in model_input.items() } gt = {key: value.double() for key, value in gt.items()} if use_lbfgs: def closure(): optim.zero_grad() model_output = model(model_input) losses = loss_fn(model_output, gt) train_loss = 0. for loss_name, loss in losses.items(): train_loss += loss.mean() train_loss.backward() return train_loss optim.step(closure) model_output = model(model_input) losses = loss_fn(model_output, gt) # import ipdb; ipdb.set_trace() train_loss = 0. 
for loss_name, loss in losses.items(): single_loss = loss.mean() if loss_schedules is not None and loss_name in loss_schedules: writer.add_scalar( loss_name + "_weight", loss_schedules[loss_name](total_steps), total_steps) single_loss *= loss_schedules[loss_name](total_steps) writer.add_scalar(loss_name, single_loss, total_steps) train_loss += single_loss train_losses.append(train_loss.item()) writer.add_scalar("total_train_loss", train_loss, total_steps) if not total_steps % steps_til_summary: torch.save( model.state_dict(), os.path.join(checkpoints_dir, 'model_current.pth')) # summary_fn(model, model_input, gt, model_output, writer, total_steps) if not use_lbfgs: optim.zero_grad() train_loss.backward() if clip_grad: if isinstance(clip_grad, bool): torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.) else: torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=clip_grad) optim.step() pbar.update(1) if not total_steps % steps_til_summary: tqdm.write( "Epoch %d, Total loss %0.6f, iteration time %0.6f" % (epoch, train_loss, time.time() - start_time)) if val_dataloader is not None: print("Running validation set...") model.eval() with torch.no_grad(): val_losses = [] for (model_input, gt) in val_dataloader: model_output = model(model_input) val_loss = loss_fn(model_output, gt) val_losses.append(val_loss) writer.add_scalar("val_loss", np.mean(val_losses), total_steps) model.train() total_steps += 1 torch.save(model.state_dict(), os.path.join(checkpoints_dir, 'model_final.pth')) np.savetxt(os.path.join(checkpoints_dir, 'train_losses_final.txt'), np.array(train_losses))
def train(opt): params = Params(f'projects/{opt.project}.yml') if params.num_gpus == 0: os.environ['CUDA_VISIBLE_DEVICES'] = '-1' if torch.cuda.is_available(): torch.cuda.manual_seed(42) else: torch.manual_seed(42) opt.saved_path = opt.saved_path + f'/{params.project_name}/' opt.log_path = opt.log_path + f'/{params.project_name}/tensorboard/' os.makedirs(opt.log_path, exist_ok=True) os.makedirs(opt.saved_path, exist_ok=True) training_params = { 'batch_size': opt.batch_size, 'shuffle': True, 'drop_last': True, 'collate_fn': collater, 'num_workers': opt.num_workers } val_params = { 'batch_size': opt.batch_size, 'shuffle': False, 'drop_last': True, 'collate_fn': collater, 'num_workers': opt.num_workers } input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536, 1536] training_set = CocoAlbumentationsDataset( root_dir=os.path.join(opt.data_path, params.project_name), set=params.train_set, transform=A.Compose( [ eval(params.augmentation[i]) for i in range(len(params.augmentation)) ], bbox_params=A.BboxParams(format='coco', label_fields=['category_ids'], min_visibility=0.2), ), img_size=input_sizes[opt.compound_coef]) training_generator = DataLoader(training_set, **training_params) val_set = CocoDataset(root_dir=os.path.join(opt.data_path, params.project_name), set=params.val_set, transform=transforms.Compose([ Normalizer(mean=params.mean, std=params.std), Resizer(input_sizes[opt.compound_coef]) ])) val_generator = DataLoader(val_set, **val_params) model = EfficientDetBackbone(num_classes=len(params.obj_list), compound_coef=opt.compound_coef, ratios=eval(params.anchors_ratios), scales=eval(params.anchors_scales)) # load last weights if opt.load_weights is not None: weights_path = opt.load_weights try: ret = model.load_state_dict(torch.load(weights_path), strict=False) except RuntimeError as e: print(f'[Warning] Ignoring {e}') print( '[Warning] Don\'t panic if you see this, this might be because you load a pretrained weights with different number of classes. The rest of the weights should be loaded already.' ) print(f'[Info] loaded weights: {os.path.basename(weights_path)}') else: print('[Info] initializing weights...') init_weights(model) # freeze backbone if train head_only if opt.head_only: def freeze_backbone(m): classname = m.__class__.__name__ for ntl in ['EfficientNet', 'BiFPN']: if ntl in classname: for param in m.parameters(): param.requires_grad = False model.apply(freeze_backbone) print('[Info] freezed backbone') # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch # apply sync_bn when using multiple gpu and batch_size per gpu is lower than 4 # useful when gpu memory is limited. # because when bn is disable, the training will be very unstable or slow to converge, # apply sync_bn can solve it, # by packing all mini-batch across all gpus as one batch and normalize, then send it back to all gpus. # but it would also slow down the training by a little bit. 
if params.num_gpus > 1 and opt.batch_size // params.num_gpus < 4: model.apply(replace_w_sync_bn) use_sync_bn = True else: use_sync_bn = False writer = SummaryWriter( opt.log_path + f'/{datetime.datetime.now().strftime("%Y%m%d-%H%M%S")}/') # warp the model with loss function, to reduce the memory usage on gpu0 and speedup model = ModelWithLoss(model, debug=opt.debug) if params.num_gpus > 0: model = model.cuda() if params.num_gpus > 1: model = CustomDataParallel(model, params.num_gpus) if use_sync_bn: patch_replication_callback(model) if opt.optim == 'adamw': optimizer = torch.optim.AdamW(model.parameters(), opt.lr, weight_decay=opt.weight_decay) else: optimizer = torch.optim.SGD(model.parameters(), opt.lr, momentum=0.9, weight_decay=opt.weight_decay, nesterov=True) epoch = 0 best_loss = 1e5 best_epoch = 0 step = 0 model.train() num_iter_per_epoch = len(training_generator) ##################################################################################### # Check the validation loss for the first time # to determine the best_loss more accurately if opt.load_weights is not None: if opt.load_weights.endswith('.pth'): None else: model.eval() loss_regression_ls = [] loss_classification_ls = [] for iter, data in enumerate(val_generator): with torch.no_grad(): imgs = data['img'] annot = data['annot'] if params.num_gpus == 1: imgs = imgs.cuda() annot = annot.cuda() cls_loss, reg_loss = model(imgs, annot, obj_list=params.obj_list) cls_loss = cls_loss.mean() reg_loss = reg_loss.mean() loss = cls_loss + reg_loss if loss == 0 or not torch.isfinite(loss): continue loss_classification_ls.append(cls_loss.item()) loss_regression_ls.append(reg_loss.item()) cls_loss = np.mean(loss_classification_ls) reg_loss = np.mean(loss_regression_ls) best_loss = cls_loss + reg_loss model.train() ######################################################################################## try: for epoch in range(opt.num_epochs): last_epoch = step // num_iter_per_epoch if epoch < last_epoch: continue epoch_loss = [] progress_bar = tqdm(training_generator) for iter, data in enumerate(progress_bar): if iter < step - last_epoch * num_iter_per_epoch: progress_bar.update() continue try: imgs = data['img'] annot = data['annot'] if params.num_gpus == 1: # if only one gpu, just send it to cuda:0 # elif multiple gpus, send it to multiple gpus in CustomDataParallel, not here imgs = imgs.cuda() annot = annot.cuda() optimizer.zero_grad() cls_loss, reg_loss = model(imgs, annot, obj_list=params.obj_list) cls_loss = cls_loss.mean() reg_loss = reg_loss.mean() loss = cls_loss + reg_loss if loss == 0 or not torch.isfinite(loss): continue loss.backward() # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1) optimizer.step() epoch_loss.append(float(loss)) progress_bar.set_description( 'Step: {}. Epoch: {}/{}. Iteration: {}/{}. Cls loss: {:.5f}. Reg loss: {:.5f}. 
Total loss: {:.5f}' .format(step, epoch, opt.num_epochs, iter + 1, num_iter_per_epoch, cls_loss.item(), reg_loss.item(), loss.item())) writer.add_scalars('Loss', {'train': loss}, step) writer.add_scalars('Regression_loss', {'train': reg_loss}, step) writer.add_scalars('Classfication_loss', {'train': cls_loss}, step) # log learning_rate current_lr = optimizer.param_groups[0]['lr'] writer.add_scalar('learning_rate', current_lr, step) step += 1 if step % opt.save_interval == 0 and step > 0: save_checkpoint( model, f'efficientdet-d{opt.compound_coef}_last.pth') print('checkpoint...') except Exception as e: print('[Error]', traceback.format_exc()) print(e) continue if epoch % opt.val_interval == 0: model.eval() loss_regression_ls = [] loss_classification_ls = [] for iter, data in enumerate(val_generator): with torch.no_grad(): imgs = data['img'] annot = data['annot'] if params.num_gpus == 1: imgs = imgs.cuda() annot = annot.cuda() cls_loss, reg_loss = model(imgs, annot, obj_list=params.obj_list) cls_loss = cls_loss.mean() reg_loss = reg_loss.mean() loss = cls_loss + reg_loss if loss == 0 or not torch.isfinite(loss): continue loss_classification_ls.append(cls_loss.item()) loss_regression_ls.append(reg_loss.item()) cls_loss = np.mean(loss_classification_ls) reg_loss = np.mean(loss_regression_ls) loss = cls_loss + reg_loss print( 'Val. Epoch: {}/{}. Classification loss: {:1.5f}. Regression loss: {:1.5f}. Total loss: {:1.5f}' .format(epoch, opt.num_epochs, cls_loss, reg_loss, loss)) writer.add_scalars('Loss', {'val': loss}, step) writer.add_scalars('Regression_loss', {'val': reg_loss}, step) writer.add_scalars('Classfication_loss', {'val': cls_loss}, step) if loss < best_loss: best_loss = loss best_epoch = epoch save_checkpoint( model, f'efficientdet-d{opt.compound_coef}_best.pth') model.train() except KeyboardInterrupt: save_checkpoint(model, f'efficientdet-d{opt.compound_coef}_last.pth') writer.close() writer.close()
def train_fn(data_loader, model, optimizer, device, scheduler=None): """ Trains the bert model on the twitter data """ # Set model to training mode (dropout + sampled batch norm is activated) model.train() losses = utils.AverageMeter() jaccards = utils.AverageMeter() # Set tqdm to add loading screen and set the length tk0 = tqdm(data_loader, total=len(data_loader), ncols=80) # Train the model on each batch for bi, d in enumerate(tk0): ids = d["ids"] token_type_ids = d["token_type_ids"] mask = d["mask"] targets_start = d["targets_start"] targets_end = d["targets_end"] sentiment = d["sentiment"] orig_selected = d["orig_selected"] orig_tweet = d["orig_tweet"] targets_start = d["targets_start"] targets_end = d["targets_end"] offsets = d["offsets"] # Move ids, masks, and targets to gpu while setting as torch.long ids = ids.to(device, dtype=torch.long) token_type_ids = token_type_ids.to(device, dtype=torch.long) mask = mask.to(device, dtype=torch.long) targets_start = targets_start.to(device, dtype=torch.long) targets_end = targets_end.to(device, dtype=torch.long) # Reset gradients model.zero_grad() # Use ids, masks, and token types as input to the model # Predict logits for each of the input tokens for each batch outputs_start, outputs_end = model( ids=ids, mask=mask, token_type_ids=token_type_ids, ) # (bs x SL), (bs x SL) # Calculate batch loss based on CrossEntropy loss = loss_fn(outputs_start, outputs_end, targets_start, targets_end) # Calculate gradients based on loss loss.backward() # Adjust weights based on calculated gradients optimizer.step() # Update scheduler scheduler.step() # Apply softmax to the start and end logits # This squeezes each of the logits in a sequence to a value between 0 and 1, while ensuring that they sum to 1 # This is similar to the characteristics of "probabilities" outputs_start = torch.softmax(outputs_start, dim=1).cpu().detach().numpy() outputs_end = torch.softmax(outputs_end, dim=1).cpu().detach().numpy() # Calculate the jaccard score based on the predictions for this batch jaccard_scores = [] for px, tweet in enumerate(orig_tweet): selected_tweet = orig_selected[px] tweet_sentiment = sentiment[px] jaccard_score, _ = calculate_jaccard_score( original_tweet= tweet, # Full text of the px'th tweet in the batch target_string=selected_tweet, # Span containing the specified sentiment for the px'th tweet in the batch sentiment_val= tweet_sentiment, # Sentiment of the px'th tweet in the batch idx_start=np.argmax( outputs_start[px, :] ), # Predicted start index for the px'th tweet in the batch idx_end=np.argmax( outputs_end[px, :] ), # Predicted end index for the px'th tweet in the batch offsets=offsets[ px] # Offsets for each of the tokens for the px'th tweet in the batch ) # if tweet_sentiment == 'neutral': # continue jaccard_scores.append(jaccard_score) # Update the jaccard score and loss # For details, refer to `AverageMeter` in https://www.kaggle.com/abhishek/utils avg = np.mean(jaccard_scores) if len(jaccard_scores) else 0 jaccards.update(avg, len(jaccard_scores)) losses.update(loss.item(), ids.size(0)) # Print the average loss and jaccard score at the end of each batch tk0.set_postfix(loss=losses.avg, jaccard=jaccards.avg)
def train(opt): params = Params(f'projects/{opt.project}.yml') global_validation_it = 0 if params.num_gpus == 0: os.environ['CUDA_VISIBLE_DEVICES'] = '-1' if torch.cuda.is_available(): torch.cuda.manual_seed(42) else: torch.manual_seed(42) opt.saved_path = opt.saved_path + f'/{params.project_name}/' opt.log_path = opt.log_path + f'/{params.project_name}/tensorboard/' os.makedirs(opt.log_path, exist_ok=True) os.makedirs(opt.saved_path, exist_ok=True) training_params = { 'batch_size': opt.batch_size, 'shuffle': True, 'drop_last': True, 'collate_fn': TUMuchTrafficDataset.collater, 'num_workers': opt.num_workers } val_params = { 'batch_size': opt.batch_size, 'shuffle': False, 'drop_last': True, 'collate_fn': TUMuchTrafficDataset.collater, 'num_workers': opt.num_workers } advprop = opt.advprop if advprop: # for models using advprop pretrained weights normalize = transforms.Lambda( lambda mem: { "img": (mem["img"] * 2.0 - 1.0).astype(np.float32), "annot": mem["annot"] }) else: # for other models normalize = Normalizer(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) tfs = transforms.Compose([ TopCutter(886), transforms.RandomApply([Negate()], p=0.1), transforms.RandomApply([ContrastEnhancementWithNoiseReduction()], p=0.1), Resize(384), RandomCrop(384, 768), normalize, HorizontalFlip(prob=0.5), transforms.RandomApply([AddGaussianNoise(0, 2.55)], p=0.5), transforms.RandomApply([AddSaltAndPepperNoise(prob=0.0017)], p=0.5), ToTensor() ]) tfrecord_paths = [opt.data_path ] if opt.data_path.endswith(".tfrecord") else [ str(x.absolute()) for x in Path(opt.data_path).rglob('*.tfrecord') ] training_set = TUMuchTrafficDataset(tfrecord_paths=tfrecord_paths, transform=tfs) training_generator = DataLoader(training_set, **training_params) tfrecord_paths = [opt.data_path ] if opt.data_path.endswith(".tfrecord") else [ str(x.absolute()) for x in Path(opt.val_path).rglob('*.tfrecord') ] val_set = TUMuchTrafficDataset(tfrecord_paths=tfrecord_paths, transform=tfs) val_generator = DataLoader(val_set, **val_params) if not opt.load_backbone: load_weights = False else: load_weights = True model = EfficientDetBackbone(num_classes=len(params.obj_list), compound_coef=opt.compound_coef, ratios=eval(params.anchors_ratios), scales=eval(params.anchors_scales), load_weights=load_weights) pytorch_total_params = sum(p.numel() for p in model.parameters()) print("# Params: {:08d}".format(pytorch_total_params)) # load last weights if opt.load_weights is not None: if opt.load_weights.endswith('.pth'): weights_path = opt.load_weights else: weights_path = get_last_weights(opt.saved_path) try: last_step = int( os.path.basename(weights_path).split('_')[-1].split('.')[0]) except: last_step = 0 try: ret = model.load_state_dict(torch.load(weights_path), strict=False) except RuntimeError as e: print(f'[Warning] Ignoring {e}') print( '[Warning] Don\'t panic if you see this, this might be because you load a pretrained weights with different number of classes. The rest of the weights should be loaded already.' 
) print( f'[Info] loaded weights: {os.path.basename(weights_path)}, resuming checkpoint from step: {last_step}' ) else: last_step = 0 print('[Info] initializing weights...') init_weights(model) # freeze backbone if train head_only if opt.head_only: def freeze_backbone(m): classname = m.__class__.__name__ for ntl in ['EfficientNet', 'BiFPN']: if ntl in classname: for param in m.parameters(): param.requires_grad = False model.apply(freeze_backbone) print('[Info] freezed backbone') # freeze backbone (only efficientnet) if train no_effnet if opt.no_effnet: def freeze_backbone(m): classname = m.__class__.__name__ for ntl in ['EfficientNet']: if ntl in classname: for param in m.parameters(): param.requires_grad = False model.apply(freeze_backbone) print('[Info] freezed backbone') pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad) print("# Training Parameters: {:06}".format(pytorch_total_params)) # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch # apply sync_bn when using multiple gpu and batch_size per gpu is lower than 4 # useful when gpu memory is limited. # because when bn is disable, the training will be very unstable or slow to converge, # apply sync_bn can solve it, # by packing all mini-batch across all gpus as one batch and normalize, then send it back to all gpus. # but it would also slow down the training by a little bit. if params.num_gpus > 1 and opt.batch_size // params.num_gpus < 4: model.apply(replace_w_sync_bn) use_sync_bn = True else: use_sync_bn = False writer = SummaryWriter( opt.log_path + f'/{datetime.datetime.now().strftime("%Y%m%d-%H%M")}/') # warp the model with loss function, to reduce the memory usage on gpu0 and speedup model = ModelWithLoss(model, debug=opt.debug) if params.num_gpus > 0: model = model.cuda() if params.num_gpus > 1: model = CustomDataParallel(model, params.num_gpus) if use_sync_bn: patch_replication_callback(model) if opt.optim == 'adamw': optimizer = torch.optim.AdamW(model.parameters(), opt.lr) else: optimizer = torch.optim.SGD(model.parameters(), opt.lr, momentum=0.9, nesterov=True) scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=1e6, verbose=True) # use apex for mixed precision training # model, optimizer = amp.initialize(model, optimizer) epoch = 0 best_loss = 1e5 best_epoch = 0 step = max(0, last_step) model.train() num_iter_per_epoch = len(training_generator) try: for epoch in range(opt.num_epochs): last_epoch = step // num_iter_per_epoch if epoch < last_epoch: continue epoch_loss = [] progress_bar = tqdm(training_generator) for it, data in enumerate(progress_bar): if it < step - last_epoch * num_iter_per_epoch: progress_bar.update() continue try: imgs = data['img'] annot = data['annot'] if params.num_gpus == 1: # if only one gpu, just send it to cuda:0 # elif multiple gpus, send it to multiple gpus in CustomDataParallel, not here imgs = imgs.cuda() annot = annot.cuda() global_validation_it += 1 optimizer.zero_grad() cls_loss, reg_loss = model(imgs, annot) cls_loss = cls_loss.mean() reg_loss = reg_loss.mean() loss = cls_loss + reg_loss if loss == 0 or not torch.isfinite(loss): continue loss.backward() # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1) optimizer.step() epoch_loss.append(float(loss)) progress_bar.set_description( 'Step: {}. Epoch: {}/{}. Iteration: {}/{}. Cls loss: {:.5f}. Reg loss: {:.5f}. 
Total loss: {:.5f}' .format(step, epoch, opt.num_epochs, it + 1, num_iter_per_epoch, cls_loss.item(), reg_loss.item(), loss.item())) writer.add_scalars('Loss', {'train': loss}, step) writer.add_scalars('Regression_loss', {'train': reg_loss}, step) writer.add_scalars('Classfication_loss', {'train': cls_loss}, step) # log learning_rate current_lr = optimizer.param_groups[0]['lr'] writer.add_scalar('learning_rate', current_lr, step) step += 1 if step % opt.save_interval == 0 and step > 0: save_checkpoint( model, f'efficientdet-d{opt.compound_coef}_{epoch}_{step}.pth' ) print('checkpoint...') except Exception as e: print('[Error]', traceback.format_exc()) print(e) continue scheduler.step(np.mean(epoch_loss)) # sleep for 30 seconds, to reduce overheating import time time.sleep(30) if epoch % opt.val_interval == 0: model.eval() loss_regression_ls = [] loss_classification_ls = [] for it, data in enumerate(val_generator): with torch.no_grad(): imgs = data['img'] annot = data['annot'] if params.num_gpus == 1: # if only one gpu, just send it to cuda:0 # elif multiple gpus, send it to multiple gpus in CustomDataParallel, not here imgs = imgs.cuda() annot = annot.cuda() if it < 12: plot_tensorboard(imgs, annot, model, writer, global_validation_it, it, "") global_validation_it += 1 if params.num_gpus == 1: imgs = imgs.cuda() annot = annot.cuda() cls_loss, reg_loss = model(imgs, annot, obj_list=params.obj_list) cls_loss = cls_loss.mean() reg_loss = reg_loss.mean() loss = cls_loss + reg_loss if loss == 0 or not torch.isfinite(loss): continue loss_classification_ls.append(cls_loss.item()) loss_regression_ls.append(reg_loss.item()) cls_loss = np.mean(loss_classification_ls) reg_loss = np.mean(loss_regression_ls) loss = cls_loss + reg_loss print( 'Val. Epoch: {}/{}. Classification loss: {:1.5f}. Regression loss: {:1.5f}. Total loss: {:1.5f}' .format(epoch, opt.num_epochs, cls_loss, reg_loss, loss)) writer.add_scalars('Loss', {'val': loss}, step) writer.add_scalars('Regression_loss', {'val': reg_loss}, step) writer.add_scalars('Classfication_loss', {'val': cls_loss}, step) if loss + opt.es_min_delta < best_loss: best_loss = loss best_epoch = epoch save_checkpoint( model, f'efficientdet-d{opt.compound_coef}_{epoch}_{step}.pth' ) model.train() # Early stopping if epoch - best_epoch > opt.es_patience > 0: print( '[Info] Stop training at epoch {}. The lowest loss achieved is {}' .format(epoch, best_loss)) break except KeyboardInterrupt: save_checkpoint( model, f'efficientdet-d{opt.compound_coef}_{epoch}_{step}.pth') writer.close() writer.close()
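# A minimal sketch of how train(opt) above might be driven. The attribute names are
# taken from how `opt` is used inside train(); the values (including the project
# name) are illustrative placeholders, and the real project presumably builds `opt`
# with argparse.
from types import SimpleNamespace

opt = SimpleNamespace(
    project='tumtraffic',          # expects projects/tumtraffic.yml to exist (hypothetical name)
    compound_coef=0,               # EfficientDet-D0
    batch_size=8,
    num_workers=4,
    lr=1e-4,
    optim='adamw',
    num_epochs=50,
    head_only=False,
    no_effnet=False,
    load_backbone=True,
    load_weights=None,
    advprop=False,
    data_path='data/train',        # directory of *.tfrecord files, or a single .tfrecord
    val_path='data/val',
    saved_path='logs',
    log_path='logs',
    save_interval=500,
    val_interval=1,
    es_min_delta=0.0,
    es_patience=0,
    debug=False,
)
# train(opt)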
def eval_fn(data_loader, model, device):
    """
    Evaluation function to predict on the test set
    """
    # Set model to evaluation mode
    # I.e., turn off dropout and set batchnorm to use overall mean and variance (from training), rather than batch level mean and variance
    # Reference: https://github.com/pytorch/pytorch/issues/5406
    model.eval()
    losses = utils.AverageMeter()
    jaccards = utils.AverageMeter()

    # Turns off gradient calculations (https://datascience.stackexchange.com/questions/32651/what-is-the-use-of-torch-no-grad-in-pytorch)
    with torch.no_grad():
        tk0 = tqdm(data_loader, total=len(data_loader), ncols=80)
        # Make predictions and calculate loss / jaccard score for each batch
        for bi, d in enumerate(tk0):
            ids = d["ids"]
            token_type_ids = d["token_type_ids"]
            mask = d["mask"]
            sentiment = d["sentiment"]
            orig_selected = d["orig_selected"]
            orig_tweet = d["orig_tweet"]
            targets_start = d["targets_start"]
            targets_end = d["targets_end"]
            offsets = d["offsets"].numpy()

            # Move ids, masks, and targets to gpu while setting as torch.long
            ids = ids.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            targets_start = targets_start.to(device, dtype=torch.long)
            targets_end = targets_end.to(device, dtype=torch.long)

            # Predict logits for start and end indexes
            outputs_start, outputs_end = model(ids=ids,
                                               mask=mask,
                                               token_type_ids=token_type_ids)
            # Calculate loss for the batch
            loss = loss_fn(outputs_start, outputs_end, targets_start, targets_end)
            # Apply softmax to the predicted logits for the start and end indexes
            # This converts the "logits" to "probability-like" scores
            outputs_start = torch.softmax(outputs_start, dim=1).cpu().detach().numpy()
            outputs_end = torch.softmax(outputs_end, dim=1).cpu().detach().numpy()

            # Calculate jaccard scores for each tweet in the batch
            jaccard_scores = []
            for px, tweet in enumerate(orig_tweet):
                selected_tweet = orig_selected[px]
                tweet_sentiment = sentiment[px]
                jaccard_score, _ = calculate_jaccard_score(
                    original_tweet=tweet,
                    target_string=selected_tweet,
                    sentiment_val=tweet_sentiment,
                    idx_start=np.argmax(outputs_start[px, :]),
                    idx_end=np.argmax(outputs_end[px, :]),
                    offsets=offsets[px])
                jaccard_scores.append(jaccard_score)

            # Update running jaccard score and loss
            jaccards.update(np.mean(jaccard_scores), ids.size(0))
            losses.update(loss.item(), ids.size(0))
            # Print the running average loss and jaccard score
            tk0.set_postfix(loss=losses.avg, jaccard=jaccards.avg)

    print(f"Jaccard = {jaccards.avg}")
    return jaccards.avg
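# eval_fn (and train_fn above) report a span-level Jaccard score via
# calculate_jaccard_score, which is defined elsewhere. The underlying metric from the
# Kaggle Tweet Sentiment Extraction competition compares the word sets of the
# predicted and ground-truth spans; a minimal sketch of that metric:
def jaccard(str1: str, str2: str) -> float:
    """Word-level Jaccard similarity between two strings."""
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))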
model = MnistResNet().to(device) dataset = MnistDataset() train_loader, val_loader, _ = dataset.data_loaders(batch_size=256) losses = [] loss_function = nn.CrossEntropyLoss() optimizer = optim.Adadelta(model.parameters()) batches = len(train_loader) val_batches = len(val_loader) # training loop + eval loop for epoch in range(epochs): total_loss = 0 progress = tqdm(enumerate(train_loader), desc="Loss: ", total=batches) model.train() for i, data in progress: X, y = data[0].to(device), data[1].to(device) model.zero_grad() outputs = model(X) loss = loss_function(outputs, y) loss.backward() optimizer.step() current_loss = loss.item() total_loss += current_loss progress.set_description("Loss: {:.4f}".format(total_loss / (i + 1)))
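# The comment above promises a "training loop + eval loop", but only the training
# half is shown. A matching validation pass, reusing the loss_function, val_loader,
# and val_batches defined above and assuming the loader yields (inputs, labels)
# pairs (a sketch, not the author's code):
model.eval()
val_loss, correct, total = 0.0, 0, 0
with torch.no_grad():
    for X, y in val_loader:
        X, y = X.to(device), y.to(device)
        outputs = model(X)
        val_loss += loss_function(outputs, y).item()
        correct += (outputs.argmax(dim=1) == y).sum().item()
        total += y.size(0)
print("Val loss: {:.4f}, accuracy: {:.4f}".format(val_loss / val_batches, correct / total))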
def keyword_categories(file_info: List[FileInfo], category_words): return [ kw_cat(fi.text + " " + " ".join(news.simple_tokenize(fi.url)), category_words) for fi in tqdm(file_info, desc="assigning keyword-based labels") ]
    result = len(list(doc.sents))
    return result


# %%
from collections import Counter

number_of_sents, pos_adj_counter, neg_adj_counter, ent_counter = (
    [],
    Counter(),
    Counter(),
    Counter(),
)
for review, sentiment in tqdm(data.to_records(index=False)):
    doc = nlp(review)
    adjs = get_adjs(doc)
    ents = get_ents(doc)  # entities feed ent_counter (see the sketch below)
    # record the sentence count for this review
    number_of_sents.append(get_number_of_sents(doc))
    # tally adjectives into the counter matching the review's sentiment
    for adj in adjs:
        counter = pos_adj_counter if sentiment == "positive" else neg_adj_counter
        counter[adj.lower()] += 1
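    # A plausible completion for the entity counter, mirroring the adjective loop
    # above (a sketch: get_ents is assumed to return entity texts as strings, which
    # the original notebook may handle differently):
    for ent in ents:
        ent_counter[ent.lower()] += 1

# Afterwards the counters can be inspected, e.g.:
# pos_adj_counter.most_common(10)
# ent_counter.most_common(10)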
def fit( self, train_dataloader: DataLoader, evaluator: SentenceEvaluator = None, epochs: int = 1, loss_fct=None, acitvation_fct=nn.Identity(), scheduler: str = 'WarmupLinear', warmup_steps: int = 10000, optimizer_class: Type[Optimizer] = transformers.AdamW, optimizer_params: Dict[str, object] = { 'lr': 2e-5, 'eps': 1e-6, 'correct_bias': False }, weight_decay: float = 0.01, evaluation_steps: int = 0, output_path: str = None, save_best_model: bool = True, max_grad_norm: float = 1, use_amp: bool = False, callback: Callable[[float, int, int], None] = None, ): """ Train the model with the given training objective Each training objective is sampled in turn for one batch. We sample only as many batches from each objective as there are in the smallest one to make sure of equal training with each dataset. :param train_dataloader: DataLoader with training InputExamples :param evaluator: An evaluator (sentence_transformers.evaluation) evaluates the model performance during training on held-out dev data. It is used to determine the best model that is saved to disc. :param epochs: Number of epochs for training :param loss_fct: Which loss function to use for training. If None, will use nn.BCEWithLogitsLoss() if self.config.num_labels == 1 else nn.CrossEntropyLoss() :param acitvation_fct: Activation function applied on top of logits output of model. :param scheduler: Learning rate scheduler. Available schedulers: constantlr, warmupconstant, warmuplinear, warmupcosine, warmupcosinewithhardrestarts :param warmup_steps: Behavior depends on the scheduler. For WarmupLinear (default), the learning rate is increased from o up to the maximal learning rate. After these many training steps, the learning rate is decreased linearly back to zero. :param optimizer_class: Optimizer :param optimizer_params: Optimizer parameters :param weight_decay: Weight decay for model parameters :param evaluation_steps: If > 0, evaluate the model using evaluator after each number of training steps :param output_path: Storage path for the model and evaluation files :param save_best_model: If true, the best model (according to evaluator) is stored at output_path :param max_grad_norm: Used for gradient normalization. :param use_amp: Use Automatic Mixed Precision (AMP). Only for Pytorch >= 1.6.0 :param callback: Callback function that is invoked after each evaluation. 
It must accept the following three parameters in this order: `score`, `epoch`, `steps` """ train_dataloader.collate_fn = self.smart_batching_collate if use_amp: from torch.cuda.amp import autocast scaler = torch.cuda.amp.GradScaler() self.model.to(self._target_device) if output_path is not None: os.makedirs(output_path, exist_ok=True) self.best_score = -9999999 num_train_steps = int(len(train_dataloader) * epochs) # Prepare optimizers param_optimizer = list(self.model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': weight_decay }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = optimizer_class(optimizer_grouped_parameters, **optimizer_params) if isinstance(scheduler, str): scheduler = SentenceTransformer._get_scheduler( optimizer, scheduler=scheduler, warmup_steps=warmup_steps, t_total=num_train_steps) if loss_fct is None: loss_fct = nn.BCEWithLogitsLoss( ) if self.config.num_labels == 1 else nn.CrossEntropyLoss() skip_scheduler = False for epoch in trange(epochs, desc="Epoch"): training_steps = 0 self.model.zero_grad() self.model.train() for features, labels in tqdm(train_dataloader, desc="Iteration", smoothing=0.05): if use_amp: with autocast(): model_predictions = self.model(**features, return_dict=True) logits = acitvation_fct(model_predictions.logits) if self.config.num_labels == 1: logits = logits.view(-1) loss_value = loss_fct(logits, labels) scale_before_step = scaler.get_scale() scaler.scale(loss_value).backward() scaler.unscale_(optimizer) torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_grad_norm) scaler.step(optimizer) scaler.update() skip_scheduler = scaler.get_scale() != scale_before_step else: model_predictions = self.model(**features, return_dict=True) logits = acitvation_fct(model_predictions.logits) if self.config.num_labels == 1: logits = logits.view(-1) loss_value = loss_fct(logits, labels) loss_value.backward() torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_grad_norm) optimizer.step() optimizer.zero_grad() if not skip_scheduler: scheduler.step() training_steps += 1 if evaluator is not None and evaluation_steps > 0 and training_steps % evaluation_steps == 0: self._eval_during_training(evaluator, output_path, save_best_model, epoch, training_steps, callback) self.model.zero_grad() self.model.train() if evaluator is not None: self._eval_during_training(evaluator, output_path, save_best_model, epoch, -1, callback)
def _estimate_transforms(self, nsamples):
    """ Uses block matrix inversion identities to quickly estimate transforms.

    After a bit of matrix math we can isolate a transform matrix (# features x # features)
    that is independent of any sample we are explaining. It is the result of averaging over
    all feature permutations, but we just use a fixed number of samples to estimate the value.

    TODO: Do a brute force enumeration when # feature subsets is less than nsamples. This could
          happen through a recursive method that uses the same block matrix inversion as below.
    """
    M = len(self.coef)

    mean_transform = np.zeros((M, M))
    x_transform = np.zeros((M, M))
    inds = np.arange(M, dtype=int)  # plain int: np.int is removed in recent NumPy versions
    for _ in tqdm(range(nsamples), "Estimating transforms"):
        np.random.shuffle(inds)
        cov_inv_SiSi = np.zeros((0, 0))
        cov_Si = np.zeros((M, 0))
        for j in range(M):
            i = inds[j]

            # use the last Si as the new S
            cov_S = cov_Si
            cov_inv_SS = cov_inv_SiSi

            # get the new cov_Si
            cov_Si = self.cov[:, inds[:j + 1]]

            # compute the new cov_inv_SiSi from cov_inv_SS
            d = cov_Si[i, :-1].T
            t = np.matmul(cov_inv_SS, d)
            Z = self.cov[i, i]
            u = Z - np.matmul(t.T, d)
            cov_inv_SiSi = np.zeros((j + 1, j + 1))
            if j > 0:
                cov_inv_SiSi[:-1, :-1] = cov_inv_SS + np.outer(t, t) / u
                cov_inv_SiSi[:-1, -1] = cov_inv_SiSi[-1, :-1] = -t / u
            cov_inv_SiSi[-1, -1] = 1 / u

            # + coef @ (Q(bar(Sui)) - Q(bar(S)))
            mean_transform[i, i] += self.coef[i]

            # + coef @ R(Sui)
            coef_R_Si = np.matmul(
                self.coef[inds[j + 1:]],
                np.matmul(cov_Si, cov_inv_SiSi)[inds[j + 1:]])
            mean_transform[i, inds[:j + 1]] += coef_R_Si

            # - coef @ R(S)
            coef_R_S = np.matmul(self.coef[inds[j:]],
                                 np.matmul(cov_S, cov_inv_SS)[inds[j:]])
            mean_transform[i, inds[:j]] -= coef_R_S

            # - coef @ (Q(Sui) - Q(S))
            x_transform[i, i] += self.coef[i]

            # + coef @ R(Sui)
            x_transform[i, inds[:j + 1]] += coef_R_Si

            # - coef @ R(S)
            x_transform[i, inds[:j]] -= coef_R_S

    mean_transform /= nsamples
    x_transform /= nsamples
    return mean_transform, x_transform
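# The inner loop above grows cov_inv_SiSi from cov_inv_SS using the bordered
# (Schur-complement) inverse update instead of calling np.linalg.inv at every step.
# A small, self-contained sanity check of that identity on a random SPD matrix
# (a sketch for intuition, independent of the class above):
import numpy as np

rng = np.random.default_rng(0)
M = 6
A = rng.normal(size=(M, M))
cov = A @ A.T + M * np.eye(M)          # a random SPD "covariance"

order = rng.permutation(M)
S, i = order[:3], order[3]             # existing subset S and the new feature i

A_inv = np.linalg.inv(cov[np.ix_(S, S)])
d = cov[S, i]                          # cross-covariance between S and i
Z = cov[i, i]
t = A_inv @ d
u = Z - d @ t                          # Schur complement of cov[S, S]

block_inv = np.zeros((len(S) + 1, len(S) + 1))
block_inv[:-1, :-1] = A_inv + np.outer(t, t) / u
block_inv[:-1, -1] = block_inv[-1, :-1] = -t / u
block_inv[-1, -1] = 1.0 / u

Sui = np.concatenate([S, [i]])
# The incrementally built inverse matches the direct inverse of cov[S u {i}, S u {i}]
assert np.allclose(block_inv, np.linalg.inv(cov[np.ix_(Sui, Sui)]))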
def predict(self, sentences: List[List[str]], batch_size: int = 32, show_progress_bar: bool = None, num_workers: int = 0, activation_fct=None, apply_softmax=False, convert_to_numpy: bool = True, convert_to_tensor: bool = False): """ Performs predicts with the CrossEncoder on the given sentence pairs. :param sentences: A list of sentence pairs [[Sent1, Sent2], [Sent3, Sent4]] :param batch_size: Batch size for encoding :param show_progress_bar: Output progress bar :param num_workers: Number of workers for tokenization :param activation_fct: Activation function applied on the logits output of the CrossEncoder. If None, nn.Sigmoid() will be used if num_labels=1, else nn.Identity :param convert_to_numpy: Convert the output to a numpy matrix. :param apply_softmax: If there are more than 2 dimensions and apply_softmax=True, applies softmax on the logits output :param convert_to_tensor: Conver the output to a tensor. :return: Predictions for the passed sentence pairs """ input_was_string = False if isinstance( sentences[0], str): # Cast an individual sentence to a list with length 1 sentences = [sentences] input_was_string = True inp_dataloader = DataLoader( sentences, batch_size=batch_size, collate_fn=self.smart_batching_collate_text_only, num_workers=num_workers, shuffle=False) if show_progress_bar is None: show_progress_bar = (logger.getEffectiveLevel() == logging.INFO or logger.getEffectiveLevel() == logging.DEBUG) iterator = inp_dataloader if show_progress_bar: iterator = tqdm(inp_dataloader, desc="Batches") if activation_fct is None: activation_fct = nn.Sigmoid( ) if self.config.num_labels == 1 else nn.Identity() pred_scores = [] self.model.eval() self.model.to(self._target_device) with torch.no_grad(): for features in iterator: model_predictions = self.model(**features, return_dict=True) logits = activation_fct(model_predictions.logits) if apply_softmax and len(logits[0]) > 1: logits = torch.nn.functional.softmax(logits, dim=1) pred_scores.extend(logits) if self.config.num_labels == 1: pred_scores = [score[0] for score in pred_scores] if convert_to_tensor: pred_scores = torch.stack(pred_scores) elif convert_to_numpy: pred_scores = np.asarray( [score.cpu().detach().numpy() for score in pred_scores]) if input_was_string: pred_scores = pred_scores[0] return pred_scores
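# A short usage sketch for predict() above, assuming the enclosing class is
# sentence-transformers' CrossEncoder (the model name is only an example):
from sentence_transformers import CrossEncoder

model = CrossEncoder('cross-encoder/stsb-roberta-base')
scores = model.predict([
    ['A man is eating food.', 'A man is eating a piece of bread.'],
    ['A man is eating food.', 'The girl is carrying a baby.'],
])
# `scores` is a numpy array with one relevance/similarity score per sentence pair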
parser.add_argument('-o', type=Path) args = parser.parse_args() logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO) logging.info(f'Using {args.storage} storage 💾') if args.o is not None: save_dir = args.o save_dir.mkdir(exist_ok=True) storage = LocalStorage() if args.storage == 'local' else AWSSTorage() if args.storage == 'local': logging.info(f'Store root={storage.root}') override = True bar = tqdm(zoo_source.items()) uploading_bar = tqdm() for key, src_def in bar: bar.set_description(key) if src_def is None: # it means I was lazy and I meant to use timm src_def = partial(timm.create_model, key, pretrained=True) if key not in storage or override: src, dst = src_def(), AutoModel.from_name(key) cloned = clone_model(src, dst) storage(key, cloned, uploading_bar) # uploading_bar.update(0)
def range_test( self, train_loader, val_loader=None, start_lr=None, end_lr=10, num_iter=100, step_mode="exp", smooth_f=0.05, diverge_th=5, accumulation_steps=1, non_blocking_transfer=True, ): """Performs the learning rate range test. Arguments: train_loader (`torch.utils.data.DataLoader` or child of `TrainDataLoaderIter`, optional): the training set data loader. If your dataset (data loader) returns a tuple (inputs, labels,*) then Pytorch data loader object can be provided. However, if a dataset returns different outputs e.g. dicts, then you should inherit from `TrainDataLoaderIter` class and redefine `inputs_labels_from_batch` method so that it outputs (inputs, labels). val_loader (`torch.utils.data.DataLoader` or child of `ValDataLoaderIter`, optional): if `None` the range test will only use the training loss. When given a data loader, the model is evaluated after each iteration on that dataset and the evaluation loss is used. Note that in this mode the test takes significantly longer but generally produces more precise results. Similarly to `train_loader`, if your dataset outputs are not standard you should inherit from `ValDataLoaderIter` class and redefine method `inputs_labels_from_batch` so that it outputs (inputs, labels). Default: None. start_lr (float, optional): the starting learning rate for the range test. Default: None (uses the learning rate from the optimizer). end_lr (float, optional): the maximum learning rate to test. Default: 10. num_iter (int, optional): the number of iterations over which the test occurs. Default: 100. step_mode (str, optional): one of the available learning rate policies, linear or exponential ("linear", "exp"). Default: "exp". smooth_f (float, optional): the loss smoothing factor within the [0, 1[ interval. Disabled if set to 0, otherwise the loss is smoothed using exponential smoothing. Default: 0.05. diverge_th (int, optional): the test is stopped when the loss surpasses the threshold: diverge_th * best_loss. Default: 5. accumulation_steps (int, optional): steps for gradient accumulation. If it is 1, gradients are not accumulated. Default: 1. non_blocking_transfer (bool, optional): when non_blocking_transfer is set, tries to convert/move data to the device asynchronously if possible, e.g., moving CPU Tensors with pinned memory to CUDA devices. Default: True. Example (fastai approach): >>> lr_finder = LRFinder(net, optimizer, criterion, device="cuda") >>> lr_finder.range_test(dataloader, end_lr=100, num_iter=100) Example (Leslie Smith's approach): >>> lr_finder = LRFinder(net, optimizer, criterion, device="cuda") >>> lr_finder.range_test(trainloader, val_loader=val_loader, end_lr=1, num_iter=100, step_mode="linear") Gradient accumulation is supported; example: >>> train_data = ... # prepared dataset >>> desired_bs, real_bs = 32, 4 # batch size >>> accumulation_steps = desired_bs // real_bs # required steps for accumulation >>> dataloader = torch.utils.data.DataLoader(train_data, batch_size=real_bs, shuffle=True) >>> acc_lr_finder = LRFinder(net, optimizer, criterion, device="cuda") >>> acc_lr_finder.range_test(dataloader, end_lr=10, num_iter=100, accumulation_steps=accumulation_steps) If your DataLoader returns e.g. 
dict, or other non-standard output, inherit from `TrainDataLoaderIter` and redefine method `inputs_labels_from_batch` so that it outputs (inputs, labels) data:

            >>> import torch_lr_finder
            >>> class TrainIter(torch_lr_finder.TrainDataLoaderIter):
            >>>     def inputs_labels_from_batch(self, batch_data):
            >>>         return (batch_data['user_features'], batch_data['user_history']), batch_data['y_labels']
            >>> train_data_iter = TrainIter(train_dl)
            >>> finder = torch_lr_finder.LRFinder(model, optimizer, partial(model._train_loss, need_one_hot=False))
            >>> finder.range_test(train_data_iter, end_lr=10, num_iter=300, diverge_th=10)

        Reference:
        [Training Neural Nets on Larger Batches: Practical Tips for 1-GPU, Multi-GPU & Distributed setups](
        https://medium.com/huggingface/ec88c3e51255)
        [thomwolf/gradient_accumulation](https://gist.github.com/thomwolf/ac7a7da6b1888c2eeac8ac8b9b05d3d3)
        """
        # Reset test results
        self.history = {"lr": [], "acc": []}
        self.best_loss = None
        self.best_acc = None

        # Move the model to the proper device
        self.model.to(self.device)

        # Check if the optimizer is already attached to a scheduler
        self._check_for_scheduler()

        # Set the starting learning rate
        if start_lr:
            self._set_learning_rate(start_lr)

        # Initialize the proper learning rate policy
        if step_mode.lower() == "exp":
            lr_schedule = ExponentialLR(self.optimizer, end_lr, num_iter)
        elif step_mode.lower() == "linear":
            lr_schedule = LinearLR(self.optimizer, end_lr, num_iter)
        else:
            raise ValueError(
                "expected one of (exp, linear), got {}".format(step_mode))

        if smooth_f < 0 or smooth_f >= 1:
            raise ValueError("smooth_f is outside the range [0, 1[")

        # Create an iterator to get data batch by batch
        if isinstance(train_loader, DataLoader):
            train_iter = TrainDataLoaderIter(train_loader)
        elif isinstance(train_loader, TrainDataLoaderIter):
            train_iter = train_loader
        else:
            raise ValueError("`train_loader` has unsupported type: {}. "
                             "Expected types are `torch.utils.data.DataLoader` "
                             "or child of `TrainDataLoaderIter`.".format(
                                 type(train_loader)))

        if val_loader:
            if isinstance(val_loader, DataLoader):
                val_iter = ValDataLoaderIter(val_loader)
            elif isinstance(val_loader, ValDataLoaderIter):
                val_iter = val_loader
            else:
                raise ValueError(
                    "`val_loader` has unsupported type: {}. "
                    "Expected types are `torch.utils.data.DataLoader` "
                    "or child of `ValDataLoaderIter`.".format(
                        type(val_loader)))

        for iteration in tqdm(range(num_iter)):
            # Train on batch and retrieve accuracy
            acc = self._train_batch(
                train_iter,
                accumulation_steps,
                non_blocking_transfer=non_blocking_transfer,
            )
            if val_loader:
                acc = self._validate(
                    val_iter, non_blocking_transfer=non_blocking_transfer)

            # Update the learning rate
            self.history["lr"].append(lr_schedule.get_lr()[0])
            lr_schedule.step()

            # Track the best accuracy and smooth it if smooth_f is specified
            if iteration == 0:
                self.best_acc = acc
            else:
                if smooth_f > 0:
                    acc = smooth_f * acc + (1 - smooth_f) * self.history["acc"][-1]
                if acc > self.best_acc:
                    self.best_acc = acc

            # Record the (smoothed) accuracy; the original loss-divergence check is disabled here
            self.history["acc"].append(acc)
            #if loss > diverge_th * self.best_loss:
            #    print("Stopping early, the loss has diverged")
            #    break

        print(
            "Learning rate search finished. See the graph with {finder_name}.plot()"
        )
def train_encoder_model(encoder, train_dl, test_dl, Y, Y_test, epochs=config.num_enc_epochs): epochs_losses = [] criterion = nn.MSELoss() optimizer = optim.Adam(lr=1e-3, params=encoder.parameters()) # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5) scaler = torch.cuda.amp.GradScaler(enabled=True) Y = torch.from_numpy(Y).float().cuda() Y_test = torch.from_numpy(Y_test).float().cuda() epochs_losses_test = [] for epoch in tqdm(range(epochs)): losses = [] test_losses = [] encoder.train() for i, batch in enumerate(train_dl, 0): inputs, mris, idxs = batch inputs = einops.rearrange(inputs, 'b h w c -> b c h w') # inputs.cuda() Y_batch = mris.float().cuda() with torch.cuda.amp.autocast(): outputs = encoder(inputs.float().cuda()).float().cuda() # print(len(inputs), len(idxs)) # print(outputs.size()) # print(outputs) # print(Y_batch) loss = criterion(outputs, Y_batch) scaler.scale(loss).backward() scaler.step(optimizer) scaler.update() # scheduler.step() optimizer.zero_grad() losses.append(loss.item()) encoder.eval() with torch.no_grad(): for i, batch in enumerate(test_dl): inputs, mris, idxs = batch inputs = einops.rearrange(inputs, 'b h w c -> b c h w') Y_test_batch = mris.float().cuda() with torch.cuda.amp.autocast(): outputs = encoder(inputs.float().cuda()).float().cuda() loss = criterion(outputs, Y_test_batch) test_losses.append(loss.item()) print( f'[{epoch+1}, loss: {np.mean(losses):.4f} test loss: {np.mean(test_losses):.4f}' ) epochs_losses.append(np.mean(losses)) epochs_losses_test.append(np.mean(test_losses)) losses = [] test_losses = [] # import matplotlib.pyplot as plt # plt.plot(range(epochs), epochs_losses, label='train') # plt.plot(range(epochs), epochs_losses_test, label='test') # plt.legend() # plt.show() return encoder
def train(model: nn.Module, scheduler, optimizer, images, datasets, n_epoches: int, batch_size: int, eval_valid_freq: int = 1, eval_test_freq: int = 3, device=None): log_dir = inc_folder_no(ROOT_DIR / "runs" / "s04_exp" / "run_") writer = SummaryWriter(log_dir=log_dir) global_step = 0 model.train() valid_res = {'loss': float('nan'), 'accuracy': float('nan')} test_res = {'loss': float('nan'), 'accuracy': float('nan')} best_performance = 0.0 train_examples, valid_examples, test_examples = datasets try: with tqdm(range(n_epoches), desc='epoch') as epoches, tqdm(total=math.ceil( sum([len(x) for x in train_examples]) / batch_size) * n_epoches, desc='training') as pbar: for i in epoches: scheduler.step() for bimgs, bx, bnts, bxlen, bnt, bnbtn, bnrow in iter_batch( batch_size, images, train_examples, shuffle=True, device=device): pbar.update() global_step += 1 model.zero_grad() bnts_pred = model(bimgs, bx, bxlen) loss, mask, btokens = padded_aware_nllloss(bnts_pred, bnts) accuracy = ((torch.argmax(bnts_pred, dim=1) == bnts.view(-1)).float() * mask).sum().item() / btokens loss.backward() optimizer.step() writer.add_scalar('train/loss', loss, global_step) writer.add_scalar('train/accuracy', accuracy, global_step) pbar.set_postfix(loss=f"{loss:.5f}", accuracy=f"{accuracy:.5f}") if (i + 1) % eval_valid_freq == 0: valid_res = eval(model, images, valid_examples, device) for k, v in valid_res.items(): writer.add_scalar(f'valid/{k}', v, global_step) if valid_res['accuracy'] > best_performance: best_performance = valid_res['accuracy'] torch.save(model, log_dir + f"/model.{i}.bin") if (i + 1) % eval_test_freq == 0: test_res = eval(model, images, test_examples, device) for k, v in test_res.items(): writer.add_scalar(f'test/{k}', v, global_step) epoches.set_postfix( v_l=f'{valid_res["loss"]:.5f}', v_a=f'{valid_res["accuracy"]:.5f}', t_l=f'{test_res["loss"]:.5f}', t_a=f'{test_res["accuracy"]:.5f}', ) finally: writer.close()
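# train() above unpacks loss, mask, btokens from padded_aware_nllloss, which is not
# shown here. A minimal sketch consistent with how those values are consumed (a
# scalar loss, a flat 0/1 mask over non-padding positions, and the count of
# non-padding tokens); treating index 0 as padding and expecting log-probabilities
# of shape (batch * seq_len, vocab) are assumptions:
import torch
import torch.nn.functional as F

def padded_aware_nllloss(log_probs, targets, pad_idx: int = 0):
    """NLL loss over flattened token log-probabilities, ignoring padding."""
    targets = targets.view(-1)
    mask = (targets != pad_idx).float()
    n_tokens = mask.sum().item()
    # sum the per-token NLL over real tokens, then average by the token count
    loss = F.nll_loss(log_probs, targets, ignore_index=pad_idx, reduction='sum')
    loss = loss / max(n_tokens, 1)
    return loss, mask, n_tokens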
def encode(self, sentences: Union[str, List[str], List[int]], batch_size: int = 32, show_progress_bar: bool = None, output_value: str = 'sentence_embedding', convert_to_numpy: bool = True, convert_to_tensor: bool = False, is_pretokenized: bool = False, device: str = None, num_workers: int = 0) -> Union[List[Tensor], ndarray, Tensor]: """ Computes sentence embeddings :param sentences: the sentences to embed :param batch_size: the batch size used for the computation :param show_progress_bar: Output a progress bar when encode sentences :param output_value: Default sentence_embedding, to get sentence embeddings. Can be set to token_embeddings to get wordpiece token embeddings. :param convert_to_numpy: If true, the output is a list of numpy vectors. Else, it is a list of pytorch tensors. :param convert_to_tensor: If true, you get one large tensor as return. Overwrites any setting from convert_to_numpy :param is_pretokenized: DEPRECATED - No longer used :param device: Which torch.device to use for the computation :param num_workers: DEPRECATED - No longer used :return: By default, a list of tensors is returned. If convert_to_tensor, a stacked tensor is returned. If convert_to_numpy, a numpy matrix is returned. """ self.eval() if show_progress_bar is None: show_progress_bar = (logger.getEffectiveLevel() == logging.INFO or logger.getEffectiveLevel() == logging.DEBUG) if convert_to_tensor: convert_to_numpy = False input_was_string = False if isinstance( sentences, str): #Cast an individual sentence to a list with length 1 sentences = [sentences] input_was_string = True if device is None: device = self._target_device self.to(device) all_embeddings = [] length_sorted_idx = np.argsort( [self._text_length(sen) for sen in sentences]) sentences_sorted = [sentences[idx] for idx in length_sorted_idx] iterator = range(0, len(sentences), batch_size) if show_progress_bar: iterator = tqdm(iterator, desc="Batches") for start_index in iterator: sentences_batch = sentences_sorted[start_index:start_index + batch_size] features = self.tokenize(sentences_batch) features = batch_to_device(features, device) with torch.no_grad(): out_features = self.forward(features) embeddings = out_features[output_value] if output_value == 'token_embeddings': #Set token embeddings to 0 for padding tokens input_mask = out_features['attention_mask'] input_mask_expanded = input_mask.unsqueeze(-1).expand( embeddings.size()).float() embeddings = embeddings * input_mask_expanded embeddings = embeddings.detach() # fixes for #522 and #487 to avoid oom problems on gpu with large datasets if convert_to_numpy: embeddings = embeddings.cpu() all_embeddings.extend(embeddings) all_embeddings = [ all_embeddings[idx] for idx in np.argsort(length_sorted_idx) ] if convert_to_tensor: all_embeddings = torch.stack(all_embeddings) elif convert_to_numpy: all_embeddings = np.asarray( [emb.numpy() for emb in all_embeddings]) if input_was_string: all_embeddings = all_embeddings[0] return all_embeddings
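# A short usage sketch for encode() above, assuming the enclosing class is
# sentence-transformers' SentenceTransformer (the model name is only an example):
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')
sentences = ['This framework generates embeddings for each input sentence.',
             'Sentences are passed as a list of strings.']
embeddings = model.encode(sentences,
                          batch_size=32,
                          convert_to_numpy=True,
                          show_progress_bar=True)
print(embeddings.shape)  # (2, embedding_dim)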
def do_work(collection_name, log_verbose, slurm=True, unobserved=False, post_mortem=False, num_exps=-1, filter_dict={}, dry_run=False): """Pull queued experiments from the database and run them. Parameters ---------- collection_name: str Name of the collection in the MongoDB. log_verbose: bool Print all the Python syscalls before running them. slurm: bool Use the Slurm cluster. unobserved: bool Disable all Sacred observers (nothing written to MongoDB). post_mortem: bool Activate post-mortem debugging. num_exps: int, default: -1 If >0, will only submit the specified number of experiments to the cluster. This is useful when you only want to test your setup. filter_dict: dict Dictionary for filtering the entries in the collection. dry_run: bool Just return the executables and configurations instead of running them. Returns ------- None """ collection = db_utils.get_collection(collection_name) query_dict = {'status': {"$in": ['QUEUED']}} query_dict.update(filter_dict) if collection.count_documents(query_dict) <= 0: print("No queued experiments.") return exps_list = list(collection.find(query_dict)) nexps = num_exps if num_exps > 0 else len(exps_list) exp_chunks = db_utils.chunk_list(exps_list[:nexps]) njobs = len(exp_chunks) if dry_run: configs = [] for exps in exp_chunks: for exp in exps: configs.append( get_config_from_exp(exp, log_verbose=log_verbose, unobserved=unobserved, post_mortem=post_mortem)) return configs elif slurm: print(f"Starting {nexps} experiment{s_if(nexps)} in " f"{njobs} Slurm job{s_if(njobs)}.") for exps in tqdm(exp_chunks): slurm_config = exps[0]['slurm'] seml_config = exps[0]['seml'] if 'output_dir' in slurm_config: warnings.warn( "'output_dir' has moved from 'slurm' to 'seml'. Please adapt your YAML accordingly" "by moving the 'output_dir' parameter from 'slurm' to 'seml'." ) elif 'output_dir' in seml_config: slurm_config['output_dir'] = seml_config['output_dir'] del slurm_config['experiments_per_job'] start_slurm_job(collection, exps, log_verbose, unobserved, post_mortem, **slurm_config) else: login_node_name = 'fs' if login_node_name in os.uname()[1]: raise ValueError( "Refusing to run a compute experiment on a login node. " "Please use Slurm or a compute node.") print( f'Starting local worker thread that will run up to {nexps} experiments, ' f'until no queued experiments remain.') collection.update_many(query_dict, {"$set": {"status": "PENDING"}}) num_exceptions = 0 i_exp = 0 tq = tqdm(exp_chunks) for exps in tq: for exp in exps: exe, config = get_config_from_exp(exp, log_verbose=log_verbose, unobserved=unobserved, post_mortem=post_mortem) cmd = f"python {exe} with {' '.join(config)}" if not unobserved: # check also whether PENDING experiments have their Slurm ID set, in this case they are waiting # for Slurm execution and we don't start them locally. db_entry = collection.find_one_and_update( filter={ '_id': exp['_id'], 'status': 'PENDING', 'slurm.id': { '$exists': False } }, update={ '$set': { 'seml.command': cmd, 'status': 'RUNNING' } }, upsert=False) if db_entry is None: # another worker already set this entry to PENDING (or at least, it's no longer QUEUED) # so we ignore it. continue if log_verbose: print(f'Running the following command:\n {cmd}') try: output_dir = "." slurm_config = exps[0]['slurm'] seml_config = exps[0]['seml'] if 'output_dir' in slurm_config: warnings.warn( "'output_dir' has moved from 'slurm' to 'seml'. Please adapt your YAML accordingly" "by moving the 'output_dir' parameter from 'slurm' to 'seml'." 
) output_dir = slurm_config['output_dir'] if 'output_dir' in seml_config: output_dir = seml_config['output_dir'] output_dir_path = os.path.abspath( os.path.expanduser(output_dir)) exp_name = slurm_config['name'] output_file = f"{output_dir_path}/{exp_name}_{exp['_id']}-out.txt" collection.find_and_modify( {'_id': exp['_id']}, {"$set": { "seml.output_file": output_file }}) with open(output_file, "w") as log_file: # pdb works with check_call but not with check_output. Maybe because of stdout/stdin. subprocess.check_call( cmd, shell=True, stderr=log_file, stdout=log_file, ) except subprocess.CalledProcessError as e: num_exceptions += 1 except IOError: print(f"Log file {output_file} could not be written.") # Since Sacred is never called in case of I/O error, we need to set the experiment state manually. collection.find_one_and_update( filter={'_id': exp['_id']}, update={'$set': { 'status': 'FAILED' }}, upsert=False) finally: i_exp += 1 tq.set_postfix( failed=f"{num_exceptions}/{i_exp} experiments")
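# A hypothetical invocation of do_work() above, running at most two queued
# experiments locally (no Slurm) and filtering on a config field; the collection
# name and filter values are made up for illustration:
do_work(
    collection_name='my_experiments',
    log_verbose=True,
    slurm=False,
    unobserved=False,
    post_mortem=False,
    num_exps=2,
    filter_dict={'config.dataset': 'cora'},
    dry_run=False,
)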