def sign_of_life(cls, job, num_already_present_imgs, outdir, start_time, force_rewrite=False):
    '''
    Periodically print a progress line: how many spectrogram .png files
    are now in outdir, how many of those are new since the last call,
    and how long the job has been running. Returns the number of newly
    added images when a line was printed, else the unchanged
    num_already_present_imgs.
    '''
    # Time for a sign of life?
    now_time = datetime.datetime.now()
    time_duration = now_time - start_time
    # Print roughly every 3 seconds, but only once at
    # least 3 seconds have elapsed:
    if force_rewrite \
       or (time_duration.seconds > 0 and time_duration.seconds % 3 == 0):

        # A human-readable duration, rounded down to minutes:
        duration_str = FileUtils.time_delta_str(time_duration, granularity=4)

        # Get current and newly added spectrogram images in outdir:
        num_now_present_imgs = len(
            Utils.find_in_dir_tree(outdir, pattern="*.png"))
        num_newly_present_imgs = num_now_present_imgs - num_already_present_imgs

        # Keep printing the number of finished snippets on the same
        # terminal line (carriage return instead of newline):
        print((f"{job.name}---Number of spectros: {num_now_present_imgs} "
               f"({num_newly_present_imgs} new) after {duration_str}"),
              end='\r')
        return num_newly_present_imgs
    else:
        return num_already_present_imgs
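
# Illustrative only: a minimal sketch (all names below are hypothetical, not
# part of this module) of how a caller might presumably poll sign_of_life()
# from a monitoring loop, assuming the method is exposed as a classmethod on
# a SpectrogramCreator class and that the return value is fed back in on the
# next call:
#
#     import time
#     start = datetime.datetime.now()
#     imgs_seen = 0
#     while not job.done():
#         imgs_seen = SpectrogramCreator.sign_of_life(job, imgs_seen,
#                                                     out_dir, start)
#         time.sleep(1)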
def validate_split(self, step):
    '''
    Validate one split, using that split's validation fold.
    Return the time taken. Record results for tensorboard and
    other record keeping.

    :param step: current combination of epoch and split
    :type step: int
    :return: wall-clock time taken by the validation pass
    :rtype: datetime.timedelta
    '''
    # Validation
    self.log.debug(
        f"Start of validation: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
    )

    start_time = datetime.datetime.now()
    self.log.info(f"Starting validation for step {step}")

    self.model.eval()
    with torch.no_grad():
        for img_tensor, target in self.train_loader.validation_samples():
            expanded_img_tensor = unsqueeze(img_tensor, dim=0)
            expanded_target = unsqueeze(target, dim=0)

            # Update sanity record:
            self.class_coverage[int(target)]['val'] += 1

            images = FileUtils.to_device(expanded_img_tensor, 'gpu')
            label = FileUtils.to_device(expanded_target, 'gpu')

            outputs = self.model(images)
            loss = self.loss_fn(outputs, label)

            images = FileUtils.to_device(images, 'cpu')
            outputs = FileUtils.to_device(outputs, 'cpu')
            label = FileUtils.to_device(label, 'cpu')
            loss = FileUtils.to_device(loss, 'cpu')

            self.remember_results(LearningPhase.VALIDATING,
                                  step,
                                  outputs,
                                  label,
                                  loss)
            del images
            del outputs
            del label
            del loss
            torch.cuda.empty_cache()

    end_time = datetime.datetime.now()
    val_time_duration = end_time - start_time
    # A human-readable duration, rounded down to minutes:
    duration_str = FileUtils.time_delta_str(val_time_duration, granularity=4)
    self.log.info(f"Done validation (duration: {duration_str})")

    return val_time_duration
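
# Why the unsqueeze() calls above: validation_samples() is assumed to yield
# one unbatched sample at a time, while the model expects a leading batch
# dimension. A minimal, standalone illustration (the image shape is just an
# example):
#
#     import torch
#     img = torch.zeros(3, 224, 224)          # single image: [C, H, W]
#     batch_of_one = torch.unsqueeze(img, 0)  # -> [1, C, H, W]
#     assert batch_of_one.shape == (1, 3, 224, 224)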
def train(self):

    overall_start_time = datetime.datetime.now()
    # Just for sanity: keep track
    # of the number of batches...
    total_batch_num = 0

    # Note: since we are cross validating, the
    # data loader's set_epoch() method is only
    # called once (automatically) during instantiation
    # of the associated sampler. Moving from split
    # to split includes shuffling if the caller
    # specified that.

    # Training
    for split_num in range(self.train_loader.num_folds):

        split_start_time = datetime.datetime.now()
        self.initialize_model()
        for epoch in range(self.max_epochs):

            # Set model to train mode:
            self.model.train()

            epoch_start_time = datetime.datetime.now()

            self.log.info(f"Starting epoch {epoch} training")

            # Sanity check record: will record
            # how many samples from each class were
            # used during training and validation:
            self.class_coverage = {}

            # Sanity record: distribution of labels
            # across classes seen during this epoch:
            label_distrib = {}

            batch_num = 0

            self.log.info(
                f"Train epoch {epoch}/{self.max_epochs} split {split_num}/{self.train_loader.num_folds}"
            )
            try:
                for batch, targets in self.train_loader:
                    # Update the sanity check count of batches
                    # seen, and the distribution of samples
                    # across classes:
                    batch_num += 1
                    total_batch_num += 1

                    # Update sanity check records:
                    for lbl in targets:
                        lbl = int(lbl)
                        try:
                            label_distrib[lbl] += 1
                        except KeyError:
                            label_distrib[lbl] = 1
                        try:
                            self.class_coverage[lbl]['train'] += 1
                        except KeyError:
                            self.class_coverage[lbl] = {'train': 1, 'val': 0}

                    self.log.debug(
                        f"Top of training loop: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
                    )

                    images = FileUtils.to_device(batch, 'gpu')
                    labels = FileUtils.to_device(targets, 'gpu')

                    outputs = self.model(images)
                    loss = self.loss_fn(outputs, labels)
                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()

                    # Remember the last batch's train result of this
                    # split (results for earlier batches of
                    # the same split will be overwritten). This statement
                    # must sit before deleting outputs and labels:
                    step_num = self.step_number(epoch, split_num, self.num_folds)
                    self.remember_results(LearningPhase.TRAINING,
                                          step_num,
                                          outputs,
                                          labels,
                                          loss)

                    self.log.debug(
                        f"Just before clearing gpu: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
                    )

                    images = FileUtils.to_device(images, 'cpu')
                    outputs = FileUtils.to_device(outputs, 'cpu')
                    labels = FileUtils.to_device(labels, 'cpu')
                    loss = FileUtils.to_device(loss, 'cpu')

                    del images
                    del outputs
                    del labels
                    del loss
                    torch.cuda.empty_cache()

                    self.log.debug(
                        f"Just after clearing gpu: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
                    )

            except EndOfSplit:

                end_time = datetime.datetime.now()
                train_time_duration = end_time - epoch_start_time
                # A human-readable duration, rounded down to minutes:
                duration_str = FileUtils.time_delta_str(train_time_duration,
                                                        granularity=4)

                self.log.info(
                    f"Done training epoch {epoch} of split {split_num} (duration: {duration_str})"
                )
                #***********
                #print(f"****** num_batches in split: {batch_num}" )
                #print(f"****** LblDist: {label_distrib}")
                #***********
                self.validate_split(step_num)
                self.visualize_step(step_num)
                # Save model, keeping self.model_archive_size models:
                self.model_archive.save_model(self.model, epoch)

                self.log.debug(
                    f"After eval: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
                )

            # Next Epoch
            continue

        end_time = datetime.datetime.now()
        train_time_duration = end_time - split_start_time
        # A human-readable duration, rounded down to minutes:
        duration_str = FileUtils.time_delta_str(train_time_duration,
                                                granularity=4)

        self.log.info(
            f"Done training split {split_num} (duration: {duration_str})")

        # Next split
        continue

    end_time = datetime.datetime.now()
    epoch_duration = end_time - epoch_start_time
    epoch_dur_str = FileUtils.time_delta_str(epoch_duration, granularity=4)

    cumulative_dur = end_time - overall_start_time
    cum_dur_str = FileUtils.time_delta_str(cumulative_dur, granularity=4)

    msg = f"Done epoch {epoch} (epoch duration: {epoch_dur_str}; cumulative: {cum_dur_str})"
    self.log.info(msg)

    #******self.scheduler.step()

    # Fresh results tallying
    #self.results.clear()

    self.log.info(
        f"Training complete after {self.train_loader.num_folds} splits")

    # Report the sanity checks:
    self.log.info(f"Total batches processed: {total_batch_num}")
    for cid in self.class_coverage.keys():
        train_use = self.class_coverage[cid]['train']
        val_use = self.class_coverage[cid]['val']
        self.log.info(
            f"{self.class_names[cid]} Training: {train_use}, Validation: {val_use}"
        )

    # All seems to have gone well. Report the
    # overall result of the final epoch for the
    # hparams config used in this process:
    self.report_hparams_summary(self.latest_result)

    # The final epoch number:
    return epoch
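
# For orientation only: self.step_number(epoch, split_num, num_folds), used
# in train() and passed to validate_split(), is expected to map an
# (epoch, split) pair to a monotonically increasing step index for
# tensorboard. One plausible mapping is sketched below; this is an
# assumption, not the actual implementation:
#
#     def step_number(epoch, split_num, num_folds):
#         # epoch 0 -> steps 0 .. num_folds-1, epoch 1 -> num_folds .. 2*num_folds-1, ...
#         return epoch * num_folds + split_num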
def run_inference(self, gpu_to_use=0):
    '''
    Runs the model over the dataloader. Along the way: creates a
    ResultTally for each batch, and maintains the dict instance
    variable self.raw_results for later conversion of logits to
    class IDs under different threshold assumptions.

        self.raw_results:
            {'all_outputs' : <arr>,
             'all_labels'  : <arr>
             }

    Returns a ResultCollection with the ResultTally instances
    of each batch.

    :param gpu_to_use: which GPU to deploy to (if it is available)
    :type gpu_to_use: int
    :return: collection of tallies, one for each batch,
        or None if something went wrong.
    :rtype: {None | ResultCollection}
    '''

    # Just in case the loop never runs:
    batch_num = -1
    overall_start_time = datetime.datetime.now()

    try:
        try:
            if torch.cuda.is_available():
                self.model.load_state_dict(torch.load(self.model_path))
                FileUtils.to_device(self.model, 'gpu', gpu_to_use)
            else:
                self.model.load_state_dict(
                    torch.load(self.model_path,
                               map_location=torch.device('cpu')))
        except RuntimeError as e:
            emsg = repr(e)
            if emsg.find("size mismatch for conv1") > -1:
                emsg += " Maybe the model was trained with to_grayscale=False, but the local net was created for grayscale?"
            raise RuntimeError(emsg) from e

        loss_fn = nn.CrossEntropyLoss()

        result_coll = ResultCollection()

        # Save all per-class logits so that different
        # thresholds can be applied later when converting
        # them to class IDs:
        all_outputs = []
        all_labels = []

        self.model.eval()
        num_test_samples = len(self.loader.dataset)
        self.log.info(
            f"Begin inference ({num_test_samples} test samples)...")

        samples_processed = 0

        loop_start_time = overall_start_time
        with torch.no_grad():

            for batch_num, (batch, targets) in enumerate(self.loader):
                if torch.cuda.is_available():
                    images = FileUtils.to_device(batch, 'gpu')
                    labels = FileUtils.to_device(targets, 'gpu')
                else:
                    images = batch
                    labels = targets

                outputs = self.model(images)
                loss = loss_fn(outputs, labels)

                images = FileUtils.to_device(images, 'cpu')
                outputs = FileUtils.to_device(outputs, 'cpu')
                labels = FileUtils.to_device(labels, 'cpu')
                loss = FileUtils.to_device(loss, 'cpu')

                #**********
                max_logit = outputs[0].max().item()
                max_idx = (outputs.squeeze() == max_logit).nonzero(
                    as_tuple=False).item()
                smpl_id = torch.utils.data.dataloader.sample_id_seq[-1]
                lbl = labels[0].item()
                pred_cl = max_idx
                self.curr_dict[smpl_id] = (smpl_id, lbl, pred_cl)
                #**********

                # Specify the batch_num in place
                # of an epoch, which is not applicable
                # during testing:
                tally = ResultTally(batch_num,
                                    LearningPhase.TESTING,
                                    outputs,
                                    labels,
                                    loss,
                                    self.num_classes,
                                    self.batch_size)
                result_coll.add(tally, step=None)

                all_outputs.append(outputs)
                all_labels.append(labels)

                samples_processed += len(labels)

                del images
                del outputs
                del labels
                del loss
                torch.cuda.empty_cache()

                time_now = datetime.datetime.now()
                # Sign of life roughly every 5 seconds:
                if (time_now - loop_start_time).seconds >= 5:
                    self.log.info(
                        f"GPU{gpu_to_use} processed {samples_processed}/{num_test_samples} samples"
                    )
                    loop_start_time = time_now
    finally:

        #*********
        print(f"Sample seq: {torch.utils.data.dataloader.sample_id_seq}")
        torch.utils.data.dataloader.sample_id_seq = []
        #*********

        time_now = datetime.datetime.now()
        test_time_duration = time_now - overall_start_time
        # A human-readable duration, rounded down to minutes:
        duration_str = FileUtils.time_delta_str(test_time_duration,
                                                granularity=4)
        self.log.info(
            f"Done with inference: {samples_processed} test samples; {duration_str}"
        )

        # Total number of batches we ran:
        num_batches = 1 + batch_num  # b/c of zero-based batch numbering

        # If the loader delivered nothing, the loop
        # never ran; warn, and get out:
        if num_batches == 0:
            self.log.warn(
                f"Dataloader delivered no data from {self.samples_path}")
            self.close()
            return None

        # Var all_outputs is now:
        #   [tensor([pred_cl0, pred_cl1, ..., pred_cl<num_classes - 1>]),  # for sample0
        #    tensor([pred_cl0, pred_cl1, ..., pred_cl<num_classes - 1>]),  # for sample1
        #    ...
        #    ]
        # Make it into one tensor of shape (num_batches, batch_size, num_classes),
        # unless an exception was raised at some point,
        # throwing us into this finally clause:
        if len(all_outputs) == 0:
            self.log.info(
                "No outputs were produced; thus no results to report")
            return None

        self.all_outputs_tn = torch.stack(all_outputs)
        # Be afraid...be very afraid:
        assert (self.all_outputs_tn.shape ==
                torch.Size([num_batches, self.batch_size, self.num_classes])
                )

        # Var all_labels is now num_batches tensors,
        # each containing batch_size labels:
        assert (len(all_labels) == num_batches)

        # A list of label tensors; stack them
        # into one tensor:
        self.all_labels_tn = torch.stack(all_labels)
        assert (self.all_labels_tn.shape ==
                torch.Size([num_batches, self.batch_size])
                )
        # And, equivalently:
        assert (self.all_labels_tn.shape ==
                (self.all_outputs_tn.shape[0],
                 self.all_outputs_tn.shape[1]
                 )
                )

        self.report_results(result_coll)
        self.close()
        return result_coll
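
# Illustrative sketch of the "different threshold assumptions" mentioned in
# the run_inference() docstring: given the saved logits in
# self.all_outputs_tn with shape [num_batches, batch_size, num_classes],
# class IDs can be derived later, e.g. via softmax plus a minimum-probability
# threshold. The `tester` instance name, the 0.5 threshold, and the
# UNKNOWN_CLASS sentinel are all hypothetical:
#
#     import torch
#     probs = torch.softmax(tester.all_outputs_tn, dim=-1)   # per-class probabilities
#     max_probs, pred_classes = probs.max(dim=-1)            # argmax per sample
#     UNKNOWN_CLASS = -1
#     pred_classes[max_probs < 0.5] = UNKNOWN_CLASS          # apply threshold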