Example #1
    def __init__(self,
                 model_paths,
                 samples_path,
                 batch_size=1,
                 labels_path=None,
                 gpu_ids=0):
        '''
        Given the path to a trained model,
        and the path to the root of a set
        of data, compute predictions.
        
        If labels_path is None, the subdirectory
        names between the samples_path root and the
        samples themselves are used as the ground
        truth labels.
        
        By default, batches are of size 1, because
        drop_last is always set to True, and for small
        test sets leaving out any data at all is
        undesirable. Callers can still set batch_size
        higher to gain speed if the test set is large
        enough that skipping up to batch_size - 1
        samples is acceptable.
        
        :param model_paths: path(s) to the trained model(s)
            to run inference with
        :type model_paths: [str]
        :param samples_path: path to the root of the sample data
        :type samples_path: str
        :param batch_size: number of samples to run per batch
        :type batch_size: int
        :param labels_path: path to the ground truth labels; if
            None, labels are derived from the subdirectory names
        :type labels_path: {None | str}
        :param gpu_ids: Device number of GPU, in case 
            one is available
        :type gpu_ids: {int | [int]} 
        '''

        self.model_paths = model_paths
        self.samples_path = samples_path
        self.labels_path = labels_path
        self.gpu_ids = gpu_ids if isinstance(gpu_ids, list) else [gpu_ids]
        if batch_size is not None:
            self.batch_size = batch_size
        else:
            self.batch_size = 1

        self.IMG_EXTENSIONS = FileUtils.IMG_EXTENSIONS
        self.log = LoggingService()
        self.curr_dir = os.path.dirname(__file__)
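
A minimal usage sketch of this constructor (not part of the original module; the import path, model file, sample root, and GPU id below are hypothetical placeholders):

from inference import Inferencer   # assumed module name; adjust to the actual package layout

inferencer = Inferencer(
    model_paths=['/tmp/models/mod_resnet18.pth'],   # hypothetical trained model file
    samples_path='/tmp/test_snippets',              # root with one subdirectory per class
    batch_size=16,                                  # larger batches for big test sets
    gpu_ids=0)                                      # a bare int is wrapped into [0]
inferencer.go()                                     # pairs models with GPUs and runs inference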
Example #2
    def get_weights(cls, file_root):
        '''
        Given the root of a subdirectory,
        return a tensor of weights. The order
        of the weights corresponds to the 
        naturally sorted class names.
        
        :param file_root: full path to root
            of data file subtree
        :type file_root: str
        :return: weights in naturally sorted class order
        :rtype: Tensor
        '''

        
        # Full paths of all the non-dot-starting 
        # dirs under file_root:

        #   OrderedDict{class_name : [Path(dir1), Path(dir2), ...]}
        # The class names are already sorted:
        class_name_paths_dir = FileUtils.find_class_paths(file_root)
        
        # Create:
        #  {'class1' : <num_samples>,
        #   'class2' : <num_samples>,
        #         ...
        #   }
        
        class_populations = {}
        for class_name in class_name_paths_dir.keys():
            num_samples = 0
            # Each class may have samples in multiple
            # directories; add them up:
            for class_dir in class_name_paths_dir[class_name]:
                num_samples += len([file_name 
                                     for file_name 
                                     in os.listdir(class_dir)
                                     if Path(file_name).suffix in FileUtils.IMG_EXTENSIONS
                                     ])
            class_populations[class_name] = num_samples
            
        if len(class_populations) == 0:
            LoggingService().err(f"No target classes found under {file_root}")
            sys.exit(1)
        majority_class_population = max(class_populations.values())
        weights = []
        for class_name in class_name_paths_dir.keys():
            weights.append(class_populations[class_name] / majority_class_population)

        return torch.tensor(weights) 
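
As a worked illustration of the weighting rule above (a restatement on a toy population dict, not code from the original module; sorted() stands in for the natural sort done by FileUtils.find_class_paths()):

class_populations = {'AMADEC': 30, 'ARRAUR': 10, 'CORALT': 20}
majority = max(class_populations.values())            # 30
weights = [class_populations[name] / majority         # [1.0, 0.3333, 0.6667]
           for name in sorted(class_populations)]     # class order as in the method above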
Example #3
    def __init__(self, unittesting=False):

        self.hostname = socket.getfqdn()
        if unittesting:
            # Let unittests create an instance
            # and call individual methods:
            return

        # Logging to console during launch:
        self.log = LoggingService()

        # Convenience: directory of this
        # script, and project root directory
        curr_dir = Path(__file__).parent
        proj_root = curr_dir.joinpath('../..').resolve()
        self.curr_dir = str(curr_dir)
        self.proj_root = str(proj_root)

        args_parser = BirdsTrainingArgumentsParser(
            formatter_class=BirdsTrainingArgumentsParser.
            BlankLinesHelpFormatter,
            description="PyTorch distributed training launch "
            "helper to spawn multiple distributed "
            "birds_train_parallel.py processes")

        all_args = args_parser.parse_args()
        # Separate the args for this launch script
        # from the args destined for the copies of
        # the train script:
        self.launch_args = all_args['launch_args']
        self.script_args = all_args['script_args']

        # Build the gpu_landscape dict:
        self.gather_world_layout(self.launch_args)

        self.GPUS_USED_THIS_MACHINE = self.gpu_landscape[
            self.hostname]['num_gpus']
Example #4
    def __init__(self, gpu_ids, log=None):
        '''
        Set up management of the GPUs whose ids are
        passed in; an empty gpu_ids list means
        CPU-only operation.
        
        :param gpu_ids: ids of GPUs on this machine
            that may be used
        :type gpu_ids: [int]
        '''

        if GPUManager.__is_initialized:
            return
        else:
            GPUManager.__is_initialized = True

        if len(gpu_ids) == 0:
            self.cpu_only = True
        else:
            self.cpu_only = False

        #**************
        # No SIGSEGV or SIGABRT yet:
        self.hardware_error = False
        stacktrace_fd = open(
            os.path.join(os.path.dirname(__file__), 'seg_abrt.log'), 'w')
        faulthandler.enable(stacktrace_fd)
        #**************

        self.log = LoggingService() if log is None else log
        self.gpus_available = len(gpu_ids)
        self.gpu_ids = gpu_ids
        self.who_is_who = {}
        self.lock = Lock()

        # Callback for psutil.wait_procs() to
        # call when a process finishes. The
        # currying is to get 'self' passed to the
        # method, along with the finished process:

        self.proc_finished_callback = partial(self.proc_termination_callback,
                                              self)
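
A minimal sketch of the functools.partial currying pattern used for the callback above, with stand-in names ('gpu-manager', 'proc-42') rather than real objects:

from functools import partial

def termination_callback(manager, proc):
    # The pre-bound 'manager' arrives first, the finished process second:
    print(f"{manager}: process {proc} finished")

callback = partial(termination_callback, 'gpu-manager')
callback('proc-42')        # prints: gpu-manager: process proc-42 finished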
Example #5
    def test_bad_wav_file(self):

        with tempfile.TemporaryDirectory(dir='/tmp', 
                                         prefix='test_spectro') as in_dir:
    
            with tempfile.TemporaryDirectory(dir='/tmp', 
                                             prefix='test_spectro') as out_dir:
    
    
                log_file = os.path.join(out_dir, 'err_log.txt')
                SpectrogramCreator.log = LoggingService()
                SpectrogramCreator.log.log_file=log_file
    
                bad_species_path = os.path.join(in_dir, 'BADBIRD')
                os.mkdir(bad_species_path)
                bad_bird_fname   = 'bad_audio.wav'
                assignments = [('BADBIRD', bad_bird_fname)]
                bad_bird_path = os.path.join(bad_species_path, bad_bird_fname)
                # Create a 0-length file:
                Path(bad_bird_path).touch()
                
                ret_value_slot = mp.Value("b", False)
                
                # Ensure that an error is logged, though
                # none is raised:

                SpectrogramCreator.create_from_file_list(
                    assignments,
                    in_dir,
                    out_dir,
                    WhenAlreadyDone.OVERWRITE,
                    ret_value_slot,
                    env=None)
                # Read the log file:
                with open(log_file, 'r') as fd:
                    log_entry = fd.read()
                
                # The log msg should include:
                # "ERROR: One file could not be processed ... AudioLoadException('Audio file to load is empty ..."
                self.assertTrue(log_entry.find('to load is empty') > -1)
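
The ret_value_slot above is a shared boolean; a minimal standalone sketch of that multiprocessing.Value pattern (a toy example, not taken from SpectrogramCreator):

import multiprocessing as mp

def worker(ok_slot):
    ok_slot.value = True                    # worker reports success

if __name__ == '__main__':
    ret_value_slot = mp.Value("b", False)   # 'b': signed char, used as a boolean
    proc = mp.Process(target=worker, args=(ret_value_slot,))
    proc.start()
    proc.join()
    print(bool(ret_value_slot.value))       # True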
Example #6
    def __init__(self, config_info, debugging=False):
        '''
        Constructor
        '''

        self.log = LoggingService()
        if debugging:
            self.log.logging_level = DEBUG

        self.curr_dir = os.path.dirname(os.path.abspath(__file__))

        try:
            self.config = self.initialize_config_struct(config_info)
        except Exception as e:
            msg = f"During config init: {repr(e)}"
            self.log.err(msg)
            raise RuntimeError(msg) from e

        try:
            self.root_train_test_data = self.config.getpath(
                'Paths', 'root_train_test_data', relative_to=self.curr_dir)
        except ValueError as e:
            raise ValueError(
                "Config file must contain an entry 'root_train_test_data' in section 'Paths'"
            ) from e

        self.batch_size = self.config.getint('Training', 'batch_size')
        self.kernel_size = self.config.getint('Training', 'kernel_size')
        self.min_epochs = self.config.Training.getint('min_epochs')
        self.max_epochs = self.config.Training.getint('max_epochs')
        self.lr = self.config.Training.getfloat('lr')
        self.net_name = self.config.Training.net_name
        self.pretrained = self.config.Training.getboolean('pretrained', False)
        self.freeze = self.config.Training.getint('freeze', 0)
        self.to_grayscale = self.config.Training.getboolean(
            'to_grayscale', True)

        self.set_seed(42)

        self.log.info("Parameter summary:")
        self.log.info(f"network     {self.net_name}")
        self.log.info(f"pretrained  {self.pretrained}")
        if self.pretrained:
            self.log.info(f"freeze      {self.freeze}")
        self.log.info(f"min epochs  {self.min_epochs}")
        self.log.info(f"max epochs  {self.max_epochs}")
        self.log.info(f"batch_size  {self.batch_size}")

        self.fastest_device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        self.num_classes = self.find_num_classes(self.root_train_test_data)

        self.model = NetUtils.get_net(self.net_name,
                                      num_classes=self.num_classes,
                                      pretrained=self.pretrained,
                                      freeze=self.freeze,
                                      to_grayscale=self.to_grayscale)
        self.log.debug(
            f"Before any gpu push: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
        )

        FileUtils.to_device(self.model, 'gpu')

        self.log.debug(
            f"Before after model push: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
        )

        # No cross validation:
        self.folds = 0
        self.opt_name = self.config.Training.get('optimizer',
                                                 'Adam')  # Default
        self.optimizer = self.get_optimizer(self.opt_name, self.model, self.lr)

        self.loss_fn = nn.CrossEntropyLoss()
        self.scheduler = optim.lr_scheduler.CosineAnnealingLR(
            self.optimizer, self.min_epochs)

        sample_width = self.config.getint('Training', 'sample_width', 400)
        sample_height = self.config.getint('Training', 'sample_height', 400)
        self.train_loader, self.val_loader = self.get_dataloader(
            sample_width, sample_height)
        self.class_names = self.train_loader.dataset.classes

        log_dir = os.path.join(self.curr_dir, 'runs')
        raw_data_dir = os.path.join(self.curr_dir, 'runs_raw_results')

        self.setup_tensorboard(log_dir, raw_data_dir=raw_data_dir)

        # Log a few example spectrograms to tensorboard;
        # one per class:
        TensorBoardPlotter.write_img_grid(
            self.writer,
            self.root_train_test_data,
            len(self.class_names),  # Num of train examples
        )

        # All ResultTally instances are
        # collected here (two per epoch: one
        # for all training loop runs, and one
        # for all val loop runs):

        self.step_results = ResultCollection()

        self.log.debug(
            f"Just before train: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
        )
        try:
            final_epoch = self.train()
            self.visualize_final_epoch_results(final_epoch)
        finally:
            self.close_tensorboard()
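
A minimal sketch of the optimizer/scheduler pairing set up above, using a stand-in nn.Linear model instead of NetUtils.get_net(); CosineAnnealingLR anneals the learning rate over T_max epochs (min_epochs in the constructor) and is stepped once per epoch:

import torch.nn as nn
import torch.optim as optim

model     = nn.Linear(10, 4)                         # stand-in for NetUtils.get_net(...)
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=15)  # e.g. min_epochs=15

for epoch in range(15):
    # ... training loop: optimizer.zero_grad(); loss.backward(); optimizer.step() ...
    scheduler.step()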
Example #7
class Inferencer:
    '''
    Run inference with one or more trained models
    over a tree of test samples, and report the
    results to csv files and tensorboard.
    '''

    #------------------------------------
    # Constructor
    #-------------------

    def __init__(self,
                 model_paths,
                 samples_path,
                 batch_size=1,
                 labels_path=None,
                 gpu_ids=0):
        '''
        Given the path to a trained model,
        and the path to the root of a set
        of data, compute predictions.
        
        If labels_path is None, the subdirectory
        names between the samples_path root and the
        samples themselves are used as the ground
        truth labels.
        
        By default, batches are of size 1, because
        drop_last is always set to True, and for small
        test sets leaving out any data at all is
        undesirable. Callers can still set batch_size
        higher to gain speed if the test set is large
        enough that skipping up to batch_size - 1
        samples is acceptable.
        
        :param model_paths: path(s) to the trained model(s)
            to run inference with
        :type model_paths: [str]
        :param samples_path: path to the root of the sample data
        :type samples_path: str
        :param batch_size: number of samples to run per batch
        :type batch_size: int
        :param labels_path: path to the ground truth labels; if
            None, labels are derived from the subdirectory names
        :type labels_path: {None | str}
        :param gpu_ids: Device number of GPU, in case 
            one is available
        :type gpu_ids: {int | [int]} 
        '''

        self.model_paths = model_paths
        self.samples_path = samples_path
        self.labels_path = labels_path
        self.gpu_ids = gpu_ids if isinstance(gpu_ids, list) else [gpu_ids]
        if batch_size is not None:
            self.batch_size = batch_size
        else:
            self.batch_size = 1

        self.IMG_EXTENSIONS = FileUtils.IMG_EXTENSIONS
        self.log = LoggingService()
        self.curr_dir = os.path.dirname(__file__)

    #------------------------------------
    # prep_model_inference
    #-------------------

    def prep_model_inference(self, model_path):
        '''
        1. Parses model_path into its components, and 
            creates a dict: self.model_props, which 
            contains the network type, grayscale or not,
            whether pretrained, etc.
        2. Creates self.csv_writer to write results measures
            into csv files. The destination file is determined
            as follows:
                <script_dir>/runs_raw_inferences/inf_csv_results_<datetime>/<model-props-derived-fname>.csv
        3. Creates self.writer, a tensorboard writer with destination dir:
                <script_dir>/runs_inferences/inf_results_<datetime>
        4. Creates an ImageFolder-style dataset over self.samples_path
        5. Creates a shuffling DataLoader
        6. Initializes self.num_classes and self.class_names
        7. Creates self.model from the passed-in model_path name
        
        :param model_path: path to model that will be used for
            inference by this instance of Inferencer
        :type model_path: str
        '''

        model_fname = os.path.basename(model_path)

        # Extract model properties
        # from the model filename:
        self.model_props = FileUtils.parse_filename(model_fname)

        csv_results_root = os.path.join(self.curr_dir, 'runs_raw_inferences')
        #self.csv_dir = os.path.join(csv_results_root, f"inf_csv_results_{uuid.uuid4().hex}")
        ts = FileUtils.file_timestamp()
        self.csv_dir = os.path.join(csv_results_root, f"inf_csv_results_{ts}")
        os.makedirs(self.csv_dir, exist_ok=True)

        csv_file_nm = FileUtils.construct_filename(self.model_props,
                                                   prefix='inf',
                                                   suffix='.csv',
                                                   incl_date=True)
        csv_path = os.path.join(self.csv_dir, csv_file_nm)

        self.csv_writer = CSVWriterCloseable(csv_path)

        ts = FileUtils.file_timestamp()
        tensorboard_root = os.path.join(self.curr_dir, 'runs_inferences')
        tensorboard_dest = os.path.join(tensorboard_root, f"inf_results_{ts}")
        #f"inf_results_{ts}{uuid.uuid4().hex}")
        os.makedirs(tensorboard_dest, exist_ok=True)

        self.writer = SummaryWriterPlus(log_dir=tensorboard_dest)

        dataset = SingleRootImageDataset(
            self.samples_path, to_grayscale=self.model_props['to_grayscale'])

        # Make reproducible:
        Utils.set_seed(42)
        #********Utils.set_seed(56)
        self.loader = DataLoader(dataset,
                                 batch_size=self.batch_size,
                                 shuffle=True,
                                 drop_last=True)
        self.class_names = dataset.class_names()
        self.num_classes = len(self.class_names)

        # Get the right type of model.
        # Don't bother getting it pretrained
        # or freezing it, b/c we will overwrite
        # the weights:

        self.model = NetUtils.get_net(
            self.model_props['net_name'],
            num_classes=self.num_classes,
            pretrained=False,
            freeze=0,
            to_grayscale=self.model_props['to_grayscale'])

        self.log.info(f"Tensorboard info written to {tensorboard_dest}")
        self.log.info(f"Result measurement CSV file(s) written to {csv_path}")

    #------------------------------------
    # __call__
    #-------------------

    def __call__(self, gpu_id_model_path_pair):
        gpu_id, self.model_path = gpu_id_model_path_pair
        self.prep_model_inference(self.model_path)
        self.log.info(
            f"Begining inference with model {FileUtils.ellipsed_file_path(self.model_path)} on gpu_id {gpu_id}"
        )
        #****************
        #return self.run_inference(gpu_to_use=gpu_id)
        dicts_from_runs = []
        for i in range(3):
            self.curr_dict = {}
            dicts_from_runs.append(self.curr_dict)
            self.run_inference(gpu_to_use=gpu_id)
        print(dicts_from_runs)
        #****************

    #------------------------------------
    # go
    #-------------------

    def go(self):
        # Pair models to GPUs; example for
        # self.gpu_ids == [0,4], and three models:
        #    [(gpu0, model0), (gpu4, model1), (gpu0, model2)]

        repeats = int(np.ceil(len(self.model_paths) / len(self.gpu_ids)))
        gpu_model_pairings = list(zip(self.gpu_ids * repeats,
                                      self.model_paths))

        #************* No parallelism for debugging
        self(gpu_model_pairings[0])
        return
        #************* END No parallelism for debugging

        with Pool(len(self.gpu_ids)) as inf_pool:
            # Run as many inferences in parallel as
            # there are models to try. The first arg,
            # (self): means to invoke the __call__() method
            # on self.
            result_it = inf_pool.imap(self,
                                      gpu_model_pairings,
                                      chunksize=len(self.gpu_ids))
            results = list(result_it)
            print(f"******Results: {results}")

    #------------------------------------
    # run_inference
    #-------------------

    def run_inference(self, gpu_to_use=0):
        '''
        Runs model over dataloader. Along
        the way: creates ResultTally for each
        batch, and maintains dict instance variable
        self.raw_results for later conversion of
        logits to class IDs under different threshold
        assumptions. 
        
        self.raw_results: 
                {'all_outputs' : <arr>,
                 'all_labels'  : <arr>
                 }
        
        Returns a ResultCollection with the
        ResultTally instances of each batch.

        :param gpu_to_use: which GPU to deploy to (if it is available)
        :type gpu_to_use: int
        :return: collection of tallies, one for each batch,
            or None if something went wrong.
        :rtype: {None | ResultCollection}
        '''
        # Just in case the loop never runs:
        batch_num = -1
        overall_start_time = datetime.datetime.now()

        try:
            try:
                if torch.cuda.is_available():
                    self.model.load_state_dict(torch.load(self.model_path))
                    FileUtils.to_device(self.model, 'gpu', gpu_to_use)
                else:
                    self.model.load_state_dict(
                        torch.load(self.model_path,
                                   map_location=torch.device('cpu')))
            except RuntimeError as e:
                emsg = repr(e)
                if emsg.find("size mismatch for conv1") > -1:
                    emsg += " Maybe model was trained with to_grayscale=False, but local net created for grayscale?"
                    raise RuntimeError(emsg) from e

            loss_fn = nn.CrossEntropyLoss()

            result_coll = ResultCollection()

            # Save all per-class logits for ability
            # later to use different thresholds for
            # conversion to class IDs:

            all_outputs = []
            all_labels = []

            self.model.eval()
            num_test_samples = len(self.loader.dataset)
            self.log.info(
                f"Begin inference ({num_test_samples} test samples)...")

            samples_processed = 0

            loop_start_time = overall_start_time
            with torch.no_grad():

                for batch_num, (batch, targets) in enumerate(self.loader):
                    if torch.cuda.is_available():
                        images = FileUtils.to_device(batch, 'gpu')
                        labels = FileUtils.to_device(targets, 'gpu')
                    else:
                        images = batch
                        labels = targets

                    outputs = self.model(images)
                    loss = loss_fn(outputs, labels)

                    images = FileUtils.to_device(images, 'cpu')
                    outputs = FileUtils.to_device(outputs, 'cpu')
                    labels = FileUtils.to_device(labels, 'cpu')
                    loss = FileUtils.to_device(loss, 'cpu')

                    #**********
                    max_logit = outputs[0].max().item()
                    max_idx = (outputs.squeeze() == max_logit).nonzero(
                        as_tuple=False).item()
                    smpl_id = torch.utils.data.dataloader.sample_id_seq[-1]
                    lbl = labels[0].item()
                    pred_cl = max_idx

                    self.curr_dict[smpl_id] = (smpl_id, lbl, pred_cl)
                    #**********

                    # Specify the batch_num in place
                    # of an epoch, which is not applicable
                    # during testing:
                    tally = ResultTally(batch_num, LearningPhase.TESTING,
                                        outputs, labels, loss,
                                        self.num_classes, self.batch_size)
                    result_coll.add(tally, step=None)

                    all_outputs.append(outputs)
                    all_labels.append(labels)

                    samples_processed += len(labels)

                    del images
                    del outputs
                    del labels
                    del loss

                    torch.cuda.empty_cache()

                    time_now = datetime.datetime.now()
                    # Sign of life every 5 seconds:
                    if (time_now - loop_start_time).seconds >= 5:
                        self.log.info(
                            f"GPU{gpu_to_use} processed {samples_processed}/{num_test_samples} samples"
                        )
                        loop_start_time = time_now
        finally:

            #*********
            print(f"Sample seq: {torch.utils.data.dataloader.sample_id_seq}")
            torch.utils.data.dataloader.sample_id_seq = []
            #*********
            time_now = datetime.datetime.now()
            test_time_duration = time_now - overall_start_time
            # A human-readable duration string, down to minutes:
            duration_str = FileUtils.time_delta_str(test_time_duration,
                                                    granularity=4)
            self.log.info(
                f"Done with inference: {samples_processed} test samples; {duration_str}"
            )
            # Total number of batches we ran:
            num_batches = 1 + batch_num  # b/c of zero-base

            # If loader delivered nothing, the loop
            # never ran; warn, and get out:
            if num_batches == 0:
                self.log.warn(
                    f"Dataloader delivered no data from {self.samples_path}")
                self.close()
                return None

            # Var all_outputs is now:
            #  [tensor([pred_cl0, pred_cl1, ..., pred_cl<num_classes - 1>]), # For sample0
            #   tensor([pred_cl0, pred_cl1, ..., pred_cl<num_classes - 1>]), # For sample1
            #                     ...
            #  ]
            # Make into one tensor: (num_batches, batch_size, num_classes),
            # unless an exception was raised at some point,
            # throwing us into this finally clause:
            if len(all_outputs) == 0:
                self.log.info(
                    f"No outputs were produced; thus no results to report")
                return None

            self.all_outputs_tn = torch.stack(all_outputs)
            # Be afraid...be very afraid:
            assert(self.all_outputs_tn.shape == \
                   torch.Size([num_batches,
                               self.batch_size,
                               self.num_classes])
                   )

            # Var all_labels is now num-batches tensors,
            # each containing batch_size labels:
            assert (len(all_labels) == num_batches)

            # List of per-batch label tensors. Make
            # them into one tensor:
            self.all_labels_tn = torch.stack(all_labels)
            assert(self.all_labels_tn.shape == \
                   torch.Size([num_batches, self.batch_size])
                   )
            # And equivalently:
            assert(self.all_labels_tn.shape == \
                   (self.all_outputs_tn.shape[0],
                    self.all_outputs_tn.shape[1]
                    )
                   )

            self.report_results(result_coll)
            self.close()

        return result_coll

    #------------------------------------
    # report_results
    #-------------------

    def report_results(self, tally_coll):
        self._report_textual_results(tally_coll, self.csv_dir)
        self._report_conf_matrix(tally_coll, show_in_tensorboard=True)
        self._report_charted_results()

    #------------------------------------
    # _report_conf_matrix
    #-------------------

    def _report_conf_matrix(self,
                            tally_coll,
                            show=True,
                            show_in_tensorboard=None):
        '''
        Computes the confusion matrix CM from tally collection.
        Creates an image from CM, and displays it via matplotlib,
        if the show arg is True. If show_in_tensorboard is True,
        the figure is also posted to tensorboard, no matter the
        value of the show arg.
        
        Returns the Figure object.
        
        :param tally_coll: all ResultTally instances to be included
            in the confusion matrix
        :type tally_coll: result_tallying.ResultCollection
        :param show: whether or not to call show() on the
            confusion matrix figure, or only return the Figure instance
        :type show: bool
        :param show_in_tensorboard: whether or not to post the image
            to tensorboard
        :type show_in_tensorboard: bool
        :return: Figure instance containing confusion matrix heatmap
            with color legend.
        :rtype: matplotlib.pyplot.Figure
        '''

        all_preds = []
        all_labels = []

        for tally in tally_coll.tallies(phase=LearningPhase.TESTING):
            all_preds.extend(tally.preds)
            all_labels.extend(tally.labels)

        conf_matrix = Charter.compute_confusion_matrix(all_labels,
                                                       all_preds,
                                                       self.class_names,
                                                       normalize=True)

        # Normalization in compute_confusion_matrix() is
        # to 0-1. Turn those values into percentages:
        conf_matrix_perc = (100 * conf_matrix).astype(int)

        # Decide whether or not to write
        # confusion cell values into the cells.
        # The decision depends on how many species
        # are represented in the conf matrix; too many,
        # and having numbers in all cells is too cluttered:

        if len(self.class_names
               ) > CELL_LABELING.CONF_MATRIX_CELL_LABEL_LIMIT.value:
            write_in_fields = CELL_LABELING.DIAGONAL
        else:
            write_in_fields = CELL_LABELING.ALWAYS

        fig = Charter.fig_from_conf_matrix(
            conf_matrix_perc,
            supertitle='Confusion Matrix\n',
            subtitle='Normalized to percentages',
            write_in_fields=write_in_fields)
        if show_in_tensorboard:
            self.writer.add_figure('Inference Confusion Matrix',
                                   fig,
                                   global_step=0)

        if show:
            # Something above makes fig lose its
            # canvas manager. Add that back in:
            Utils.add_pyplot_manager_to_fig(fig)
            fig.show()
        return fig

    #------------------------------------
    # _report_charted_results
    #-------------------

    def _report_charted_results(self, thresholds=None):
        '''
        Computes and (pyplot-)shows a set of precision-recall
        curves in one plot. If precision and/or recall are 
        undefined (b/c of division by zero) for all curves, then
        returns False, else True. If no curves are defined,
        logs a warning.
        
        :param thresholds: list of cutoff thresholds
            for turning logits into class ID predictions.
            If None, the default at Charters.compute_multiclass_pr_curves()
            is used.
        :type thresholds: [float]
        :return: True if curves were computed and shown. Else False
        :rtype: bool
        '''

        # Obtain a dict of CurveSpecification instances,
        # one for each class, plus the mean Average Precision
        # across all curves. The dict will be keyed
        # by class ID:

        (all_curves_info, mAP) = \
          Charter.compute_multiclass_pr_curves(
              self.all_labels_tn,
              self.all_outputs_tn,
              thresholds
              )

        # Separate out the curves without
        # ill defined prec, rec, or f1:
        well_defined_curves = list(filter(
                    lambda crv_obj: not(crv_obj['undef_prec'] or\
                                        crv_obj['undef_rec'] or\
                                        crv_obj['undef_f1']
                                        ),
                    all_curves_info.values()
                    )
            )

        if len(well_defined_curves) == 0:
            self.log.warn(
                f"For all thresholds, one or more of precision, recall or f1 are undefined. No p/r curves to show"
            )
            return False

        # Too many curves are clutter. Only
        # show the best and worst by optimal f1:
        f1_sorted = sorted(well_defined_curves,
                           key=lambda obj: obj['best_op_pt']['f1'])
        curves_to_show = {
            crv_obj['class_id']: crv_obj
            for crv_obj in (f1_sorted[0], f1_sorted[-1])
        }
        #********** Mixup with objs blurring together

        (_num_classes, fig) = \
          ClassificationPlotter.chart_pr_curves(curves_to_show)

        fig.show()
        return True

    #------------------------------------
    # _report_textual_results
    #-------------------

    def _report_textual_results(self, tally_coll, res_dir):
        '''
        Given a sequence of tallies with results
        from a series of batches, create long
        outputs and labels lists from all tallies.
        
        Computes information retrieval type values:
             precision (macro/micro/weighted/by-class)
             recall    (macro/micro/weighted/by-class)
             f1        (macro/micro/weighted/by-class)
             accuracy
             balanced_accuracy
        
        Combines these results into a Pandas series, 
        and writes them to a csv file. That file is constructed
        from the passed-in res_dir, appended with 'ir_results.csv'.
        
        Finally, constructs Github flavored tables from the
        above results, and posts them to the 'text' tab of 
        tensorboard.
        
        Returns the result measures Series.
        
        :param tally_coll: collection of tallies from batches
        :type tally_coll: ResultCollection
        :param res_dir: directory where all .csv and other 
            result files are to be written
        :type res_dir: str
        :return: results of information retrieval-like measures
        :rtype: pandas.Series
        '''

        all_preds = []
        all_labels = []

        for tally in tally_coll.tallies(phase=LearningPhase.TESTING):
            all_preds.extend(tally.preds)
            all_labels.extend(tally.labels)

        res = OrderedDict({})
        res['prec_macro'] = precision_score(all_labels,
                                            all_preds,
                                            average='macro',
                                            zero_division=0)
        res['prec_micro'] = precision_score(all_labels,
                                            all_preds,
                                            average='micro',
                                            zero_division=0)
        res['prec_weighted'] = precision_score(all_labels,
                                               all_preds,
                                               average='weighted',
                                               zero_division=0)
        res['prec_by_class'] = precision_score(all_labels,
                                               all_preds,
                                               average=None,
                                               zero_division=0)

        res['recall_macro'] = recall_score(all_labels,
                                           all_preds,
                                           average='macro',
                                           zero_division=0)
        res['recall_micro'] = recall_score(all_labels,
                                           all_preds,
                                           average='micro',
                                           zero_division=0)
        res['recall_weighted'] = recall_score(all_labels,
                                              all_preds,
                                              average='weighted',
                                              zero_division=0)
        res['recall_by_class'] = recall_score(all_labels,
                                              all_preds,
                                              average=None,
                                              zero_division=0)

        res['f1_macro'] = f1_score(all_labels,
                                   all_preds,
                                   average='macro',
                                   zero_division=0)
        res['f1_micro'] = f1_score(all_labels,
                                   all_preds,
                                   average='micro',
                                   zero_division=0)
        res['f1_weighted'] = f1_score(all_labels,
                                      all_preds,
                                      average='weighted',
                                      zero_division=0)
        res['f1_by_class'] = f1_score(all_labels,
                                      all_preds,
                                      average=None,
                                      zero_division=0)

        res['accuracy'] = accuracy_score(all_labels, all_preds)
        res['balanced_accuracy'] = balanced_accuracy_score(
            all_labels, all_preds)

        res_series = pd.Series(list(res.values()), index=list(res.keys()))

        # Write information retrieval type results
        # to a one-line .csv file, using pandas Series
        # as convenient intermediary:
        res_csv_path = os.path.join(res_dir, 'ir_results.csv')
        res_series.to_csv(res_csv_path)

        res_rnd = {}
        for meas_nm, meas_val in res.items():

            # Measure results are either floats (precision, recall, etc.),
            # or np arrays (e.g. precision-per-class). For both
            # cases, round each measure to one digit:

            res_rnd[meas_nm] = round(meas_val,1) if type(meas_val) == float \
                                                 else meas_val.round(1)

        ir_measures_skel = {
            'col_header': ['precision', 'recall', 'f1'],
            'row_labels': ['macro', 'micro', 'weighted'],
            'rows': [[
                res_rnd['prec_macro'], res_rnd['recall_macro'],
                res_rnd['f1_macro']
            ],
                     [
                         res_rnd['prec_micro'], res_rnd['recall_micro'],
                         res_rnd['f1_micro']
                     ],
                     [
                         res_rnd['prec_weighted'], res_rnd['recall_weighted'],
                         res_rnd['f1_weighted']
                     ]]
        }

        ir_per_class_rows = [[
            prec_class, recall_class, f1_class
        ] for prec_class, recall_class, f1_class in zip(
            res_rnd['prec_by_class'], res_rnd['recall_by_class'],
            res_rnd['f1_by_class'])]
        ir_by_class_skel = {
            'col_header': ['precision', 'recall', 'f1'],
            'row_labels': self.class_names,
            'rows': ir_per_class_rows
        }

        accuracy_skel = {
            'col_header': ['accuracy', 'balanced_accuracy'],
            'row_labels': ['Overall'],
            'rows': [[res_rnd['accuracy'], res_rnd['balanced_accuracy']]]
        }

        ir_measures_tbl = GithubTableMaker.make_table(ir_measures_skel,
                                                      sep_lines=False)
        ir_by_class_tbl = GithubTableMaker.make_table(ir_by_class_skel,
                                                      sep_lines=False)
        accuracy_tbl = GithubTableMaker.make_table(accuracy_skel,
                                                   sep_lines=False)

        # Write the markup tables to Tensorboard:
        self.writer.add_text('Information retrieval measures',
                             ir_measures_tbl,
                             global_step=0)
        self.writer.add_text('Per class measures',
                             ir_by_class_tbl,
                             global_step=0)
        self.writer.add_text('Accuracy', accuracy_tbl, global_step=0)

        return res_series

    #------------------------------------
    # close
    #-------------------

    def close(self):
        try:
            self.writer.close()
        except Exception as e:
            self.log.err(f"Could not close tensorboard writer: {repr(e)}")
Example #8
    def __init__(self,
                 starting_config_src,
                 hparms_spec,
                 training_script=None,
                 logfile=None,
                 quiet=False,
                 dryrun=False,
                 unittesting=False):
        '''
        The hyperparameter specifications are expected
        as an *ordered* dict (i.e. the sequence of keys
        and values is always the same for the
        keys()/values()/items() methods):
        
            {<hparm1> : [val1_1, val1_2, ...],
             <hparm2> : [val2_1, val2_2, ...]
             }
        
        :param starting_config_src: a configuration 
            whose neural net related parameters will 
            be modified below for each run.
        :type starting_config_src: {str | NeuralNetConfig}            
        :param hparms_spec:
        :type hparms_spec:
        :param training_script: path to the training script
            of which to run multiple copies. If None, will
            look in config for Path:train_script.
        :type training_script: {None | str}
        :param logfile: where to log runtime information. If None,
            log to console
        :type logfile: {None | str}
        :param quiet: whether or not to report progress
        :type quiet: bool
        :param dryrun: if True, only print the configurations
            that would be run, then return
        :type dryrun: bool
        :param unittesting: set to True if unittesting so that
            __init__() will only do a minimum, and allow unittests
            to call other methods individually
        :type unittesting: bool
        '''

        if logfile is not None:
            self.log = LoggingService(logfile=logfile)
        else:
            self.log = LoggingService()

        self.quiet = quiet

        self.curr_dir = os.path.dirname(__file__)
        self.hostname = socket.getfqdn()
        # No GPUs identified so far:
        self.WORLD_SIZE = 0

        starting_config = NeuralNetConfig(starting_config_src)
        if unittesting:
            # Leave calling of the methods below
            # to the unittests
            return

        self.training_script = training_script
        if training_script is None:
            # Try to find it in config:
            try:
                self.training_script = starting_config.getpath(
                    'Paths', 'train_script', relative_to=self.curr_dir)
            except KeyError:
                raise ValueError(
                    "Did not provide training script path on cmd line or in config"
                )

        self.gpu_landscape = self.obtain_world_map(starting_config)

        # Get list of dicts of hparm-name/hparm_value pairs;
        # one for each of the runs

        the_run_dicts = self.get_runs_hparm_specs(hparms_spec)

        # Turn the run dicts into configurations
        # that modify the starting config:
        the_run_configs = self.gen_configurations(starting_config,
                                                  the_run_dicts)

        if dryrun:
            print("Dryrun:")
            print(
                f"Would run {len(the_run_dicts)} processes with these configs:"
            )
            for configs in the_run_dicts:

                print(configs)
            return

        # Provide support for cnt-c terminating the training
        # script processes nicely:

        self.cnt_c_received = False
        signal.signal(signal.SIGTERM, self.handle_cnt_c)
        # Start one training script for each configuration:
        self.run_configurations(the_run_configs)
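
A hedged sketch of what get_runs_hparm_specs() presumably produces from the ordered spec described in the docstring (an assumption: the cross-product of all hyperparameter value lists, one dict per run):

from itertools import product

hparms_spec = {'lr':         [0.001, 0.01],
               'batch_size': [32, 64]}

run_dicts = [dict(zip(hparms_spec.keys(), combo))
             for combo in product(*hparms_spec.values())]
# [{'lr': 0.001, 'batch_size': 32}, {'lr': 0.001, 'batch_size': 64},
#  {'lr': 0.01,  'batch_size': 32}, {'lr': 0.01,  'batch_size': 64}]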
Example #9
    def __init__(self,
                 config_info,
                 device=0,
                 percentage=None,
                 debugging=False):
        '''
        
        :param config_info: all path and training parameters
        :type config_info: NeuralNetConfig
        :param debugging: output lots of debug info
        :type debugging: bool
        :param device: number of GPU to use; default is dev 0
            if any GPU is available
        :type device: {None | int}
        :param percentage: percentage of training data to 
            use
        :type percentage: {int | float}
        '''

        self.log = LoggingService()
        if debugging:
            self.log.logging_level = DEBUG

        if percentage is not None:
            # Integrity check:
            if type(percentage) not in [int, float]:
                raise TypeError(
                    f"Percentage must be int or float, not {type(percentage)}")
            if percentage < 1 or percentage > 100:
                raise ValueError(
                    f"Percentage must be between 1 and 100, not {percentage}")

        if device is None:
            device = 0
            torch.cuda.set_device(device)
        else:
            available_gpus = torch.cuda.device_count()
            if available_gpus == 0:
                self.log.info("No GPU available; running on CPU")
            else:
                if device > available_gpus - 1:
                    raise ValueError(
                        f"Asked to operate on device {device}, but only {available_gpus} are available"
                    )
                torch.cuda.set_device(device)

        self.curr_dir = os.path.dirname(os.path.abspath(__file__))

        try:
            self.config = self.initialize_config_struct(config_info)
        except Exception as e:
            msg = f"During config init: {repr(e)}"
            self.log.err(msg)
            raise RuntimeError(msg) from e

        try:
            self.root_train_test_data = self.config.getpath(
                'Paths', 'root_train_test_data', relative_to=self.curr_dir)
        except ValueError as e:
            raise ValueError(
                "Config file must contain an entry 'root_train_test_data' in section 'Paths'"
            ) from e

        self.batch_size = self.config.getint('Training', 'batch_size')
        self.kernel_size = self.config.getint('Training', 'kernel_size')
        self.min_epochs = self.config.Training.getint('min_epochs')
        self.max_epochs = self.config.Training.getint('max_epochs')
        self.lr = self.config.Training.getfloat('lr')
        self.net_name = self.config.Training.net_name
        self.pretrained = self.config.Training.getboolean('pretrained', False)
        self.num_folds = self.config.Training.getint('num_folds')
        self.freeze = self.config.Training.getint('freeze', 0)
        self.to_grayscale = self.config.Training.getboolean(
            'to_grayscale', True)

        self.set_seed(42)

        self.log.info("Parameter summary:")
        self.log.info(f"network     {self.net_name}")
        self.log.info(f"pretrained  {self.pretrained}")
        if self.pretrained:
            self.log.info(f"freeze      {self.freeze}")
        self.log.info(f"min epochs  {self.min_epochs}")
        self.log.info(f"max epochs  {self.max_epochs}")
        self.log.info(f"batch_size  {self.batch_size}")

        self.fastest_device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        self.device = self.fastest_device
        self.num_classes = self.find_num_classes(self.root_train_test_data)

        self.initialize_model()

        sample_width = self.config.getint('Training', 'sample_width', 400)
        sample_height = self.config.getint('Training', 'sample_height', 400)

        self.train_loader = self.get_dataloader(sample_width,
                                                sample_height,
                                                perc_data_to_use=percentage)
        self.log.info(f"Expecting {len(self.train_loader)} batches per epoch")
        num_train_samples = len(self.train_loader.dataset)
        num_classes = len(self.train_loader.dataset.class_names())
        self.log.info(
            f"Training set contains {num_train_samples} samples across {num_classes} classes"
        )

        self.class_names = self.train_loader.dataset.class_names()

        log_dir = os.path.join(self.curr_dir, 'runs')
        raw_data_dir = os.path.join(self.curr_dir, 'runs_raw_results')

        self.setup_tensorboard(log_dir, raw_data_dir=raw_data_dir)

        # Log a few example spectrograms to tensorboard;
        # one per class:
        TensorBoardPlotter.write_img_grid(
            self.writer,
            self.root_train_test_data,
            len(self.class_names),  # Num of train examples
        )

        # All ResultTally instances are
        # collected here: (num_folds * num_epochs)
        # tallies each for the training and
        # validation steps.

        self.step_results = ResultCollection()

        self.log.debug(
            f"Just before train: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
        )
        try:
            final_step = self.train()
            self.visualize_final_epoch_results(final_step)
        finally:
            self.close_tensorboard()
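
A hedged sketch of one way the percentage argument could be honored inside get_dataloader() (an assumption about its internals, not a quote from it): sample a random subset of indices and wrap the dataset in torch.utils.data.Subset:

import torch
from torch.utils.data import Subset, DataLoader

def percent_loader(dataset, percentage, batch_size):
    # Keep roughly <percentage> percent of the samples, chosen at random:
    num_keep = max(1, int(len(dataset) * percentage / 100))
    indices  = torch.randperm(len(dataset))[:num_keep].tolist()
    return DataLoader(Subset(dataset, indices),
                      batch_size=batch_size,
                      shuffle=True)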
Example #10
    def __init__(self, 
                 config, 
                 num_classes,
                 history_len=8,
                 model_root=None,
                 log=None):
        '''
        Constructor:
        
        :param config: configuration structure
        :type config: NeuralNetConfig
        :param num_classes: number of target classes
        :type num_classes: int
        :param history_len: number of model snapshots to 
            maintain
        :type history_len: int
        :param model_root: path to where models
            will be deposited
        :type model_root: str
        :param log: logging service to use. If
            None, create new one for display output
        :type log: LoggingService
        '''

        self.curr_dir = os.path.dirname(os.path.abspath(__file__))
        
        # Model root directory:
        if model_root is None:
            self.model_root = os.path.abspath(
                os.path.join(self.curr_dir, 
                             '../runs_models')
                )
        else:
            self.model_root = model_root

        if os.path.exists(self.model_root) and \
                not os.path.isdir(self.model_root):
            raise FileExistsError(f"{self.model_root} exists but is not a directory")

        # Ensure that intermediate dirs exist:
        try:
            os.makedirs(self.model_root)
        except FileExistsError:
            pass

        if log is None:
            self.log = LoggingService()
        else:
            self.log = log
            
        self.history_len = history_len

        # Create a subdirectory of model_root
        # where this archive keeps its models.
        # The subdir is guaranteed to be unique
        # among model_root's siblings, and it will
        # be created:
        
        self.run_subdir = self._construct_run_subdir(config, 
                                                    num_classes,
                                                    self.model_root)

        # Queue to track models, keeping the 
        # number of saved models to history_len:
        
        self.model_fnames = deque(maxlen=self.history_len)
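
A minimal sketch of the bounded-history behavior that deque(maxlen=...) provides for the model snapshots: once maxlen entries are stored, appending another silently drops the oldest one.

from collections import deque

model_fnames = deque(maxlen=3)
for fname in ['mod_1.pth', 'mod_2.pth', 'mod_3.pth', 'mod_4.pth']:
    model_fnames.append(fname)

print(list(model_fnames))   # ['mod_2.pth', 'mod_3.pth', 'mod_4.pth']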
Example #11
class TrainScriptLauncher:

    #------------------------------------
    # Constructor
    #-------------------

    # Use distributed torch default port:
    COMM_PORT = '5678'

    def __init__(self, unittesting=False):

        self.hostname = socket.getfqdn()
        if unittesting:
            # Let unittests create an instance
            # and call individual methods:
            return

        # Logging to console during launch:
        self.log = LoggingService()

        # Convenience: directory of this
        # script, and project root directory
        curr_dir = Path(__file__).parent
        proj_root = curr_dir.joinpath('../..').resolve()
        self.curr_dir = str(curr_dir)
        self.proj_root = str(proj_root)

        args_parser = BirdsTrainingArgumentsParser(
            formatter_class=BirdsTrainingArgumentsParser.
            BlankLinesHelpFormatter,
            description="PyTorch distributed training launch "
            "helper to spawn multiple distributed "
            "birds_train_parallel.py processes")

        all_args = args_parser.parse_args()
        # Separate the args for this launch script
        # from the args destined for the copies of
        # the train script:
        self.launch_args = all_args['launch_args']
        self.script_args = all_args['script_args']

        # Build the gpu_landscape dict:
        self.gather_world_layout(self.launch_args)

        self.GPUS_USED_THIS_MACHINE = self.gpu_landscape[
            self.hostname]['num_gpus']

    #------------------------------------
    # gather_world_layout
    #-------------------

    def gather_world_layout(self, launch_args):
        '''
        Compute a unique number for each GPU within
        the group of nodes (machines), starting with
        the master node's first GPU as 0 (if the master
        node has a GPU). The resulting GPU layout is
        assigned to instance variable gpu_landscape.
        
        :param launch_args: command line arguments destined
            for this launch script
        :type launch_args: dict
        '''

        try:
            config_file = launch_args['config']
            if not os.path.exists(config_file):
                raise ConfigError(
                    f"Configuration file {config_file} that was provided as command line arg does not exist."
                )
        except KeyError:
            raise RuntimeError(
                "Error: launch args must include a config file. See config.cfg.Example in project root"
            )

        self.config = DottableConfigParser(config_file)

        # Ensure that the launch_args contains
        # the path to the training script. It
        # will be there if provided on the cmd line.
        # But it may instead be under Path:train_script
        # in the configuration:

        try:
            self.launch_args['training_script']
        except KeyError:
            # The training script was not specified
            # on the command line. Is it in the config
            # file:
            try:
                self.launch_args['training_script'] = self.config.getpath(
                    'Paths', 'train_script', relative_to=self.curr_dir)
            except KeyError:
                raise ValueError(
                    "No training script specified on command line or in config file"
                )

        try:
            self.world_map_path = self.config.getpath(
                'Paths', 'world_map', relative_to=self.curr_dir)
        except KeyError:
            raise RuntimeError(
                f"Could not find entry for 'world_map' in config file {config_file}"
            )

        self.world_map = self.read_world_map(self.world_map_path)
        # Ensure that this machine has an
        # entry in the world_map:
        try:
            # Get this machine's info (sub)dict:
            _my_world_info = self.world_map[self.hostname]
        except KeyError:
            raise ConfigError(
                f"World map file does not contain entry for this machine ({self.hostname})"
            )

        self.compute_landscape = {}

        # Whether or not machine running this
        # code is the master node:
        self.am_master_node = False

        # Build gpu_landscape, which maps
        # machine names to the rank range
        # that they occupy via the number of
        # their GPUs
        #
        #    {machine_name1 : [1],
        #     machine_name2 : [0],
        #     machine_name3 : [1,2,3],
        #     ...
        #    }

        self.gpu_landscape = self.build_compute_landscape(self.world_map)

        if self.master_hostname is None:
            raise ConfigError(
                f'No master machine in {self.world_map_path}; one entry needs to be "master" : 1'
            )

        # Common pytorch port is either in the config file,
        # or we use the pytorch default
        self.MASTER_PORT = self.config.getint('Parallelism', 'master_port',
                                              self.COMM_PORT)

        # Handle special case: no GPUs anywhere, and
        # we are on node 0: in that case start a single
        # copy of the training script. If it is written
        # properly, it will detect the absence of a GPU,
        # and use the CPU. This happens during debugging
        # on a laptop:

        if self.WORLD_SIZE == 0 and self.am_master_node:
            self.WORLD_SIZE += 1

        # If trying to launch on a node without GPUs,
        # when GPUs are available elsewhere, refuse to
        # start the script (is this needed?):
        if not TESTING:
            if self.my_gpus == 0 and self.WORLD_SIZE > 0:
                raise RuntimeError(
                    "This machine does not have any GPU, but others do; training script not started."
                )

    #------------------------------------
    # launch_scripts
    #-------------------

    def launch_scripts(self):
        '''
        Launch (possibly) multiple copies of
        the training script. Use world_map.json
        to know how many, and which GPUs this
        machine is to use.
        
        Each copy is told:
        
            o MASTER_ADDR  # Where to reach the coordinating process
            o MASTER_PORT  # Corresponding port
            o RANK         # The copy's sequence number, which is
                           # unique across all participating machines
            o LOCAL_RANK   # Which of this machine's GPUs to use (0-origin)
            o WORLD_SIZE   # How many GPUs are used on all machines together
            o GPUS_USED_THIS_MACHINE # Number of GPUs *used* on this
                                     # machine, according to the world_map.

        '''

        # self.gpu_landscape (built in the constructor) assigns
        # a unique rank to each GPU within the group of nodes
        # (machines), starting with the master node's first GPU
        # as rank 0 (if the master node has a GPU):
        #
        #     {<machine_name> : {'start_rank'     : <int>,
        #                        'num_gpus'       : <int>,
        #                        'gpu_device_ids' : [<int>, ...],
        #                        'rank_range'     : [<int>, ...]}
        #     }

        # This machine's range of ranks:
        rank_range = self.gpu_landscape[self.hostname]['rank_range']
        this_machine_gpu_ids = self.gpu_landscape[
            self.hostname]['gpu_device_ids']
        min_rank_this_machine = self.gpu_landscape[self.hostname]['start_rank']

        local_rank = 0
        # Map from process object to rank (for debug msgs):
        self.who_is_who = OrderedDict()
        for rank in rank_range:

            cmd = self.training_script_start_cmd(
                rank, len(this_machine_gpu_ids), local_rank,
                min_rank_this_machine, self.launch_args, self.script_args)

            # Copy stdin, and give the copy to the subprocess.
            # This enables the subprocess to ask user whether
            # to save training state in case of a cnt-C:
            newstdin = os.fdopen(os.dup(sys.stdin.fileno()))

            # Spawn one training script.

            process = subprocess.Popen(
                cmd,
                stdin=newstdin,
                stdout=None,  # Script inherits this launch
                stderr=None  # ... script's stdout/stderr  
            )
            self.who_is_who[process] = rank
            local_rank += 1

        if not self.launch_args['quiet']:
            print(
                f"Node {self.hostname} {os.path.basename(sys.argv[0])}: Num processes launched: {len(self.who_is_who)}"
            )
            if self.am_master_node:
                print(f"Awaiting {self.WORLD_SIZE} process(es) to finish...")
            else:
                print(f"Awaiting {self.my_gpus} process(es) to finish...")

        failed_processes = []
        try:
            for process in self.who_is_who.keys():
                process.wait()
                if process.returncode != 0:
                    failed_processes.append(process)
        except KeyboardInterrupt:
            # Gently kill the training scripts:
            self.handle_cnt_c()
            pass  # See which processes get the interrupt

        num_failed = len(failed_processes)
        if num_failed > 0:
            print(f"Number of failed training scripts: {num_failed}")
            for failed_process in failed_processes:
                train_script = self.launch_args['training_script']
                script_rank = self.who_is_who[failed_process]
                msg = (
                    f"Training script {train_script} (rank {script_rank}) encountered error(s); see logfile"
                )
                print(msg)

    #------------------------------------
    # training_script_start_cmd
    #-------------------

    def training_script_start_cmd(self, rank, gpus_used_this_machine,
                                  local_rank, min_rank_this_machine,
                                  launch_args, script_args):
        '''
        From provided information, creates a legal 
        command string for starting the training script.
        
        :param rank: rank of the script, i.e. its process's place
            in the sequence of all train script processes
            across all machines
        :type rank: int
        :param gpus_used_this_machine: number of GPU devices to 
            be used, according to the world_map; may be less than
            number of available GPUs
        :type gpus_used_this_machine: int
        :param local_rank: index into the local sequence of GPUs
            for the GPU that the script is to use
        :type local_rank: int
        :param min_rank_this_machine: the lowest of the ranks among
            the training scripts on this machine
        :type min_rank_this_machine: int
        :param launch_args: command line arguments intended for the
            launch script, as opposed to being destined for the 
            train script
        :type launch_args: {str : Any}
        :param script_args: additional args for the train script
        :type script_args: {str : Any}
        '''

        # Build the shell command line,
        # starting with 'python -u':
        cmd = [sys.executable, "-u"]

        cmd.append(launch_args['training_script'])

        # Add the args for the script that were
        # in the command line:
        for arg_name in script_args.keys():
            script_arg_val = script_args[arg_name]
            if script_arg_val is None or arg_name == 'config':
                # Skip unspecified CLI args, and the config file,
                # which is added as a positional arg below:
                continue
            cmd.append(f"--{arg_name}={script_args[arg_name]}")

        # Add the 'secret' args that tell the training
        # script all the communication parameters:

        cmd.extend([
            f"--MASTER_ADDR={self.MASTER_ADDR}",
            f"--MASTER_PORT={self.MASTER_PORT}", f"--RANK={rank}",
            f"--LOCAL_RANK={local_rank}",
            f"--MIN_RANK_THIS_MACHINE={min_rank_this_machine}",
            f"--WORLD_SIZE={self.WORLD_SIZE}",
            f"--GPUS_USED_THIS_MACHINE={gpus_used_this_machine}"
        ])

        # Finally, the obligatory non-option arg
        # to the training script: the configuration
        # file:

        config_file_name = script_args['config']
        cmd.append(config_file_name)

        self.log.debug(f"****** Launch: the cmd is {cmd}")
        return cmd

    #------------------------------------
    # read_world_map
    #-------------------

    def read_world_map(self, path):
        '''
        Read the JSON5 world map file, and 
        return a corresponding dict. JSON5
        allows something like:
        
        /*
            This is a block comment.
            Notice the missing quote
            chars around the keys below.
            They are optional in JSON5.
            
        */
        
        {quintus.stanford.edu : {
            "master" : Yes
            "gpus" : 2
         },
        
         quatro.stanford.edu  : {
             "gpus" : 2,
             "devices" : [1,2]
         }
        }
        
        BUT: JSON5 gets angry at dots in the 
             keys. 
        So we first read the file and temporarily
        replace every '.' with an acceptable marker,
        then convert the markers back after parsing.
                
        :param path: path to world map file
        :type path: string
        '''
        dot_substitute = '___'

        try:
            # Read all the world map file lines:
            with open(path, 'r') as world_map_fd:
                tmp_world_map = world_map_fd.readlines()
        except IOError as e:
            raise IOError(f"World map file at {path} not found") from e

        # Replace occurrences of '.' with dot_substitute:
        new_text = []
        for line in tmp_world_map:
            new_text.append(line.replace('.', dot_substitute))

        # ... and make one string from all the lines:
        json_str = '\n'.join(new_text)

        try:
            # Hopefully, JSON5 will eat it now:
            world_map_almost = json5.loads(json_str)
        except JSONError as e:
            raise JSONError(
                f"World map file at {path} contains bad JSON") from e

        # Need to fix all the dot substitutions.
        # At this point the data structure is
        #    { <machine_name> : {spec_attr1 : val1,
        #                        spec_attr2 : val2,
        #                       }
        #    }

        # Fix the machine names first:
        mach_names_fixed = [
            machine_name.replace(dot_substitute, '.')
            for machine_name in world_map_almost.keys()
        ]

        machine_specs_fixed = []

        # Now dig into each of the nested machine spec
        # dicts, and fix attrs and values there:
        for spec in world_map_almost.values():
            # Spec is a dict nested inside the outer one:
            spec_fixed = {
                key.replace(dot_substitute, '.'): val.replace(
                    dot_substitute, '.') if isinstance(val, str) else val
                for key, val in spec.items()
            }
            machine_specs_fixed.append(spec_fixed)

        # Put it all together:
        world_map = {
            machine_name: spec_dict
            for machine_name, spec_dict in zip(mach_names_fixed,
                                               machine_specs_fixed)
        }

        return world_map

    #------------------------------------
    # build_compute_landscape
    #-------------------

    def build_compute_landscape(self, world_map):
        '''
        # Using the world_map.json config file, build 
        # a dict self.gpu_landscape like this:
        #
        #    {'machine_name1' : {'start_rank'    : <int>,
        #                        'num_gpus'      : <int>,
        #                        'gpu_device_ids': [<int>,<int>,...]
        #    {'machine_name2' : {'start_rank'    : <int>,
        #                        'num_gpus'      : <int>,
        #                        'gpu_device_ids': [<int>,<int>,...]
        #    } 
        #
        # Also sets 
        #     o self.master_hostname, the hostname
        #       running the one process that coordinates all others.
        #     o self.WORLD_SIZE, number of GPUs used across all machines
        #     o self.my_gpus, the number of GPUs on this machine
        
        :param world_map:
        :type world_map:
        :return: information about how many GPUs are
            on each node
        :rtype: OrderedDict
        '''

        if not self.hostname in world_map.keys():
            raise ConfigError(
                f"World map does not contain an entry for this machine {self.hostname}"
            )

        # World size is the number of training script processes,
        # which is equal to number of GPUs used on all participating
        # machines combined:

        # Number of GPUs across all machines:
        self.WORLD_SIZE = 0

        self.master_hostname = None

        # Go through the world map, machine (a.k.a. node)
        # one at a time, in alpha order of the machine
        # names to ensure all copies of this script
        # come to the same conclusions about ranks

        # Build gpu_landscape:
        #
        #    {'machine_name1' : {'start_rank'    : <int>,
        #                        'num_gpus'      : <int>,
        #                        'gpu_device_ids': [<int>,<int>,...]
        #    {'machine_name2' : {'start_rank'    : <int>,
        #                        'num_gpus'      : <int>,
        #                        'gpu_device_ids': [<int>,<int>,...]
        #    }
        #
        # The structure is an OrderedDict(), containing
        # machines alphabetically by name. This discipline
        # is required so that all copies of this launch script
        # (one copy per machine) arrive at the same ordering of
        # GPUs:

        gpu_landscape = OrderedDict({})

        for machine_name in sorted(world_map.keys()):

            # Get dict of info about the machine:

            machine_info = world_map[machine_name]

            try:
                machine_gpus = machine_info['gpus']
            except KeyError:
                raise ConfigError(
                    f"World map entry for {machine_name} must include a 'gpus' entry; the value may be 0"
                )

            gpu_landscape[machine_name] = {}
            gpu_landscape[machine_name]['num_gpus'] = machine_gpus

            # List of GPU numbers to use is optional
            # in world_maps:

            machine_gpus_to_use = machine_info.get('devices', None)
            if machine_gpus_to_use is None:
                # Use all GPUs on that machine:
                machine_gpus_to_use = list(range(machine_gpus))

            gpu_landscape[machine_name]['gpu_device_ids'] = machine_gpus_to_use

            # Accept all kinds of affirmatives as values:
            # for identification of the master node entry:

            is_master_node = machine_info.get('master', False) \
                in [1, 'True', 'true', 'Yes', 'yes']

            if is_master_node:
                self.master_hostname = machine_name
                if machine_name == self.hostname:
                    self.am_master_node = True
                try:
                    self.MASTER_ADDR = socket.gethostbyname(machine_name)
                except socket.gaierror:
                    # For machines that have no
                    # findable IP address:
                    self.MASTER_ADDR = '127.0.0.1'

            self.WORLD_SIZE += machine_gpus

        # Go through the machine entries in gpu_landscape, and
        # assign rank ranges to each. Must start with
        # the master node, b/c it must start with rank 0.
        # For the master node, it is possible that it has
        # no GPUs:

        master_info = gpu_landscape[self.master_hostname]
        master_info['rank_range'] = list(range(master_info['num_gpus']))
        master_info['start_rank'] = 0
        if len(master_info['rank_range']) == 0:
            # Master has no GPU; it still gets rank 0
            # (its copy of the training script runs on the CPU):
            master_info['rank_range'] = [0]

        # Start assigning more ranks after
        # the GPUs of the master:

        running_rank = master_info['rank_range'][-1] + 1

        for machine_name in gpu_landscape.keys():
            if machine_name == self.master_hostname:
                # We already did the master node
                continue
            mach_info = gpu_landscape[machine_name]
            mach_info['start_rank'] = running_rank
            num_gpus = mach_info['num_gpus']
            range_bound = running_rank + (num_gpus if num_gpus > 0 else 1)
            mach_info['rank_range'] = list(range(running_rank, range_bound))
            running_rank += (num_gpus if num_gpus > 0 else 1)

        self.my_gpus = gpu_landscape[self.hostname]['num_gpus']
        self.gpu_landscape = gpu_landscape
        return gpu_landscape
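
    # Illustration (a sketch, not executed): for a world_map such as
    #
    #     {"quintus.stanford.edu" : {"master" : "Yes", "gpus" : 1},
    #      "quatro.stanford.edu"  : {"gpus" : 2}}
    #
    # build_compute_landscape() would yield WORLD_SIZE == 3 and
    #
    #     {'quatro.stanford.edu'  : {'num_gpus' : 2,
    #                                'gpu_device_ids' : [0, 1],
    #                                'start_rank' : 1,
    #                                'rank_range' : [1, 2]},
    #      'quintus.stanford.edu' : {'num_gpus' : 1,
    #                                'gpu_device_ids' : [0],
    #                                'start_rank' : 0,
    #                                'rank_range' : [0]}}
    #
    # i.e. the master's GPU is rank 0, and the remaining ranks are
    # handed out in alphabetical machine-name order.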

    #------------------------------------
    # handle_cnt_c
    #-------------------

    def handle_cnt_c(self):
        '''
        Send SIGTERM to all training script
        processes launched by this script
        (recorded in self.who_is_who),
        highest rank first, master process last.
        '''
        # Line processes up, highest rank first,
        # master process last:

        procs_terminate = sorted([proc for proc in self.who_is_who.keys()],
                                 key=lambda obj: self.who_is_who[obj],
                                 reverse=True)

        for process in procs_terminate:
            # If process is no longer running,
            # forget about it:
            if process.poll() is not None:
                # Process dead:
                continue
            process.send_signal(signal.SIGTERM)
            process.wait()
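
The launched training script itself is not part of this example. As a rough sketch only (assumed names; not the project's actual train script), a script started by training_script_start_cmd() could pick up the 'secret' communication arguments and initialize torch.distributed like this:

import argparse

import torch
import torch.distributed as dist

def init_distributed_from_launch_args():
    # Parse only the args appended by training_script_start_cmd();
    # a real training script would add its own options as well:
    parser = argparse.ArgumentParser()
    parser.add_argument('--MASTER_ADDR', type=str)
    parser.add_argument('--MASTER_PORT', type=int)
    parser.add_argument('--RANK', type=int)
    parser.add_argument('--LOCAL_RANK', type=int)
    parser.add_argument('--MIN_RANK_THIS_MACHINE', type=int)
    parser.add_argument('--WORLD_SIZE', type=int)
    parser.add_argument('--GPUS_USED_THIS_MACHINE', type=int)
    parser.add_argument('config', type=str)   # obligatory config file path
    args, _other = parser.parse_known_args()

    # Pin this process to its assigned GPU, if there is one:
    if torch.cuda.is_available() and args.GPUS_USED_THIS_MACHINE > 0:
        torch.cuda.set_device(args.LOCAL_RANK)

    dist.init_process_group(
        backend='nccl' if torch.cuda.is_available() else 'gloo',
        init_method=f"tcp://{args.MASTER_ADDR}:{args.MASTER_PORT}",
        rank=args.RANK,
        world_size=args.WORLD_SIZE)
    return args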
Exemplo n.º 12
0
class SpectrogramChopper:
    '''
    Processes directories of .png files,
    chopping them into window_len-second snippets.

    Assumes:

                        self.input
                        
          Species1        Species2   ...     Speciesn
        spectro1_1.png     spectro2_1.png     spectro_n_1.png
        spectro1_2.png     spectro2_2.png     spectro_n_2.png
                            ...
                            
    Saves the snippets in a new directory.
    
        Resulting directories under self.out_dir will be:
         
                         self.out_dir
          Species1        Species2   ...     Speciesn
         snip_1_1_1         snip_2_1_1        snip_n_1_1
         snip_1_1_2         snip_2_1_2        snip_n_1_2
         snip_1_1_3         snip_2_1_3        snip_n_1_3
                            snip_2_1_4
            ...
         snip_1_2_1         snip_2_2_1        snip_n_2_1
         snip_1_2_2         snip_2_2_2        snip_n_2_2
         snip_1_2_3                           snip_n_2_3
         snip_1_2_4

    With snip_b_f_s: 
       o b is the bird species (manifesting in the file system
                                  as one subdirectory under self.out_dir)
       o f is one spectrogram of a full audio recording
       o s is a snippet number.
         
    Note the possibility of different numbers of snippets
    in each species, and for each original audio recording 
    (which may be of unequal lengths).

    Because many spectrograms are created, speed requirements
    call for the use of parallelism. Since each audio file's processing
    is independent from the others, the multiprocessing library
    is used as follows.
    
        - If command line arg --workers is set to 1, no parallelism
          is used. 
        - If multiple cores are available, some percentage
          of them will be deployed to chopping. Each core runs 
          a separate copy of this file. The percentage is controlled
          by MAX_PERC_OF_CORES_TO_USE.
        
    Method chop_all() is used in the single core scenario.
    Method chop_from_file_list() is used when multiprocessing. This
    method is the 'target' in the multiprocessing library's sense.

    '''

    # Common logger for all workers:
    log = LoggingService()

    MIN_SNIPPET_WIDTH = 256
    '''Minimum width of spectrogram snippets to satisfy the 
       torchvision pretrained model minimum value of 
       224 for both width and height'''

    #------------------------------------
    # Constructor
    #-------------------

    def __init__(self,
                 in_dir_or_spectro_file,
                 outdir,
                 specific_species=None,
                 overwrite_policy=WhenAlreadyDone.ASK):
        '''
        The overwrite_policy is one of the WhenAlreadyDone
        enum members: ASK, OVERWRITE, SKIP. If ASK,
        request user's permission for each encountered
        destination file. SKIP should be used when resuming
        an interrupted chopping session. Any sound file
        whose destination spectrogram exists is not processed
        again.
        
        @param in_dir_or_spectro_file: location of spectrogram root
        @type in_dir_or_spectro_file: str
        @param outdir: root of spectrograms to create
        @type outdir: str
        @param specific_species: process only a specific list of species
        @type specific_species: {None | [str]}
        @param overwrite_policy: what to do when an output file already exists
        @type overwrite_policy: WhenAlreadyDone
        '''

        # Ensure the outdir and all its intermediate dirs exist:
        os.makedirs(outdir, exist_ok=True)

        self.in_dir = in_dir_or_spectro_file if os.path.isdir(
            in_dir_or_spectro_file) else None
        self.out_dir = outdir
        self.specific_species = specific_species
        self.overwrite_policy = overwrite_policy

        self.num_chopped = 0

        # The following used to be less convoluted until
        # the option to chop just a single spectro, rather
        # than a list of subdirectories with spectros
        # in them. Sorry! :-(

        if self.in_dir is not None:
            # We are to process entire directory tree,
            # not just a single file:
            if self.specific_species is None:
                # Process all (species) subdirectories:
                self.species_list = os.listdir(self.in_dir)
            else:
                # Only process certain species:
                self.species_list = self.specific_species
            # Create destination directories for new spectrogram
            # snippets, so that the dest tree will mirror the in tree:
            self.spectrogram_dir_path = self.create_dest_dirs(
                self.species_list)
        else:
            # Just do a single spectro, no need for destination
            # subdirs:
            self.spectrogram_dir_path = outdir

        # Allow others outside the instance to find the spectros:
        SpectrogramChopper.spectrogram_dir_path = self.spectrogram_dir_path

    #------------------------------------
    # chop_from_file_list
    #-------------------

    @classmethod
    def chop_from_file_list(cls,
                            assignments,
                            in_dir,
                            out_dir,
                            global_info,
                            overwrite_policy,
                            return_bool,
                            env=None):
        '''
        Takes a list like:
        
           [(s1,f1),(s1,f2),(s4,f3)]
           
        where s_n is a species name, and f_m
        is the basename of a spectrogram file to chop.
        Example: foobar.png
        
        Returns True if all went well, else
        raises exception.
        
        Wrinkle: this method is called under two 
        very different scenarios (S1/S2). S1 is
        when the process started by the user calls
        this method. That happens when the command
        line arg --workers is set to 1, or on a machine
        where few enough cores are available that only
        one is used. In that case, env is left at None,
        and all is as normal.
        
        S2 occurs when the initial process (the one started
        from the command line) starts a new Process. That
        process normally contains a new environment, i.e. 
        some default value for all the environment variables.
        ************ NEEDED WITHOUT LIBROSA USAGE?
        In particular, DISPLAY and PYTHONPATH will not be
        what is needed. The result is that all spectrogram
        creating methods fail, because they cannot find a
        graphics backend. 
        
        In that case kwarg env is set to the environment of the 
        initiating process. At the start of this method this
        process' default environ is then set to match that
        of the initiating process.
        
        :param assignments: list of species/filename pairs
        :type assignments: [(str,str)]
        :param in_dir: root of the species subdirectories that hold
            the full-length spectrograms
        :type in_dir: str
        :param out_dir: root under which the snippet subdirectories
            are created
        :type out_dir: str
        :param global_info: shared dict used for inter-process
            progress reporting
        :type global_info: multiprocessing.manager.dict
        :param overwrite_policy: what to do when an output file
            already exists
        :type overwrite_policy: WhenAlreadyDone
        :param return_bool: shared boolean value into which this
            method writes True on success, False if any file failed
        :type return_bool: multiprocessing.Value
        :param env: if provided, the environment of the
            parent process. If None, the current env
            is retained
        :type env: {str : Any}
        '''

        # During multiprocessing this method is
        # the 'target', i.e. the entry point for
        # each child. In that case env will be
        # the environment of the initiating process.
        # We adopt that environment for this new,
        # forked process as well:

        if env is not None:
            os.environ = env

        # Optimism!
        return_bool.value = True

        for species_name, fname in assignments:
            # Ex. species_name: AMADEC
            # Ex. fname       : dysmen_my_bird.png
            full_spectro_path = os.path.join(in_dir, species_name, fname)
            try:
                cls.chop_one_spectro_file(full_spectro_path,
                                          os.path.join(out_dir, species_name),
                                          species_name,
                                          overwrite_policy=overwrite_policy)
            except Exception as e:
                return_bool.value = False
                cls.log.err((f"One file could not be processed \n"
                             f"    ({full_spectro_path}):\n"
                             f"    {repr(e)}"))
                continue

    #------------------------------------
    # chop_one_spectro_file
    #-------------------

    @classmethod
    def chop_one_spectro_file(
        cls,
        spectro_fname,
        out_dir,
        species_name,
        window_len=5,
        skip_size=2,
        original_duration=None,
        overwrite_policy=WhenAlreadyDone.ASK,
    ):
        """
        Generates window_len second spectrogram snippets
        from spectrograms files of arbitrary length. 
        
        To compute the number of time slices to extract
        for each snippet, the width of one spectrogram time
        slice (column) in fractional seconds must be known. That width
        can be approximated if the play length of the underlying
        audio is known (even if the precise fft settings are unavailable).
        
        If the given .png file contains metadata with a 'duration' 
        key, then the corresponding value is used as the duration of 
        the original audio file in fractional seconds. This metadata
        will be present if the .png file was created with the 
        SoundProcessor.create_spectrogram(). 
        
        To enable use of spectrogram images created elsewhere, callers
        can instead supply original_duration in fractional seconds.
        
        For now, if neither the embedded metadata, nor the original_duration
        is supplied, a ValueError is raised. 
    
        :param spectro_fname: full path to spectrogram file to chop
        :type spectro_fname: str
        :param out_dir: root directory under which spectrogram
            snippets will be saved (in different subdirs)
        :type out_dir: str
        :param species_name: name of species to embed in the 
            metadata of this snippet, and use for determining
            subdirectory where to place the snippet
        :type species_name: str
        :param window_len: number of seconds to be covered by each snippet
        :type window_len: int
        :param skip_size: number of seconds to shift right in 
            time for the start of each chop
        :type skip_size: int
        :param original_duration:
        :raise ValueError: if neither embedded duration metadata is found
            in the given file, nor original_duration is provided
        """

        # Read the spectrogram, getting an np array:
        spectro_arr, metadata = SoundProcessor.load_spectrogram(spectro_fname)
        duration = metadata.get('duration', None)

        if duration is None:
            if original_duration is None:
                raise ValueError(
                    f"Time duration of original recording cannot be determined for {spectro_fname}"
                )
            else:
                duration = float(original_duration)
        else:
            duration = float(duration)

        # If the original file is shorter than a
        # single window length, there is nothing to
        # chop; partial snippets are not created:

        if duration < window_len:
            # No partial snippets
            return
        # Note: Also have sample rate ('sr') and species ('label')
        # in the metadata, but don't need those here.

        _freq_bands, time_slices = spectro_arr.shape
        # Time in fractions of second
        # per spectrogram column:
        twidth = duration / time_slices

        # Integer of duration (which is in seconds):
        time_dur_int = int(np.ceil(duration))
        time_upper_bound = 1 + time_dur_int - skip_size

        # Caller specifies skip_size and window
        # length in *seconds*. Convert to spectrogram
        # time slices (with rounding error):

        samples_win_len = int(window_len // twidth)
        # Does samples_win_len satisfy the
        # minimum spectrogram snippet width for
        # pretrained models?
        samples_win_len = max(cls.MIN_SNIPPET_WIDTH, samples_win_len)

        time_true_each_snippet = samples_win_len * twidth

        samples_skip_size = int(skip_size // twidth)
        samples_upper_bound = int(time_upper_bound // twidth)

        assert (samples_upper_bound <= time_slices)

        for _snip_num, samples_start_idx in enumerate(
                range(0, samples_upper_bound, samples_skip_size)):

            # Absolute start time of this snippet
            # within the entire spectrogram:
            wall_start_time = samples_start_idx * twidth
            # Create a name for the snippet file:
            snippet_path = cls.create_snippet_fpath(spectro_fname,
                                                    round(wall_start_time),
                                                    out_dir)

            spectro_done = os.path.exists(snippet_path)

            if spectro_done:
                if overwrite_policy == WhenAlreadyDone.SKIP:
                    # Next snippet:
                    continue
                elif overwrite_policy == WhenAlreadyDone.ASK:
                    if not Utils.user_confirm(
                            f"Snippet {Path(snippet_path).stem} exists, overwrite?",
                            default='N'):
                        continue

            # Chop: All rows, columns from current
            #       window start for window length samples:
            snippet_data = spectro_arr[:, samples_start_idx:samples_start_idx +
                                       samples_win_len]
            _num_rows, num_cols = snippet_data.shape
            if num_cols < samples_win_len:
                # Leave that little spectrogram
                # snippet leftover for Elijah:
                break

            snippet_info = metadata.copy()
            # Add the snippet-specific timing and species info:
            snippet_info['duration(secs)'] = samples_win_len * twidth
            snippet_info['start_time(secs)'] = wall_start_time
            snippet_info['end_time(secs)'] = wall_start_time + (
                samples_win_len * twidth)
            snippet_info['species'] = species_name
            SoundProcessor.save_image(snippet_data, snippet_path, snippet_info)
        return time_true_each_snippet
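
    # A worked example of the arithmetic above (a sketch; the numbers are
    # hypothetical): for a 120 sec recording rendered into 1200 spectrogram
    # columns, twidth is about 0.1 sec per column. A 5 sec window then
    # corresponds to roughly 50 columns, which is below MIN_SNIPPET_WIDTH
    # (256) and is therefore widened to 256 columns, i.e. each snippet
    # actually covers about 25.6 sec. A 2 sec skip_size corresponds to
    # roughly 20 columns between successive snippet start columns.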

    #------------------------------------
    # create_dest_dirs
    #-------------------

    def create_dest_dirs(self, species_list):
        '''
        Creates all directories that will hold new 
        spectrogram snippets for each species.
        For each directory: if dir exists:
        
           o if overwrite_policy is OVERWRITE, wipe the dir
           o if overwrite_policy is SKIP, leave the
               directory in place, contents intact 
           o else ask user. 
                If response is Yes, wipe the dir
                else raise FileExistsError
                
        :param species_list: names of species to process
        :type species_list: [str]
        :return: top level dir for spectrograms (same as self.out_dir)
        :rtype: (str)
        :raise FileExistsError: if a dest dir exists and not allowed
            to wipe it.
        '''

        # Root dir of each species' spectro snippets:
        Utils.create_folder(self.out_dir,
                            overwrite_policy=self.overwrite_policy)

        # One dir each for the spectrogram snippets of one species:

        for species in species_list:
            species_spectros_dir = os.path.join(self.out_dir, species)
            if not Utils.create_folder(species_spectros_dir,
                                       overwrite_policy=self.overwrite_policy):
                raise FileExistsError(
                    f"Target dir {species_spectros_dir} exists; aborting")

        return self.out_dir

    #------------------------------------
    # create_snippet_fpath
    #-------------------

    @classmethod
    def create_snippet_fpath(cls, origin_nm, wall_start_time, out_dir):
        '''
        Given constituent elements, construct the
        full output path of a new spectrogram snippet.
        
        Name format if full-length spectrogram file were
        named my_file.png:
        
              my_file_sw-start123.png
              
        where 123 is the snippet's start time in seconds
        from the beginning of the full length file
        
        :param origin_nm: name of full length file; either full path or
            just the file name are fine
        :type origin_nm: str
        :param wall_start_time: snippet start time from beginning
            of full length spectrogram
        :type wall_start_time: int
        :param out_dir: destination directory
        :type out_dir: str
        :return: full path to the future snippet's destination
        :rtype: str
        '''

        # Prepare snippet file name creation:

        #   From '/foo/bar/infile.png'
        # make 'infile'
        snippet_name_stem = Path(origin_nm).stem

        snippet_name = f"{snippet_name_stem}_sw-start{str(wall_start_time)}.png"
        snippet_path = os.path.join(out_dir, snippet_name)
        return snippet_path

    # -------------------- Class Methods ------------

    #------------------------------------
    # compute_worker_assignments
    #-------------------

    @classmethod
    def compute_worker_assignments(cls,
                                   in_dir,
                                   dst_dir,
                                   overwrite_policy=WhenAlreadyDone.ASK,
                                   num_workers=None):
        '''
        Given the root directory of a set of
        directories whose names are species,
        and which contain spectrograms by species,
        return a multi processing worker assignment.
        
        Expected:
                         in_dir

          Species1        Species2   ...     Speciesn
           smpl1_1.png      smpl2_1.png         smpln_1.png
           smpl1_2.png      smpl2_2.png         smpln_2.png
                            ...
        
        Collects the number of spectrograms available for
        each species. Creates a list of buckets of
        (species, file name) pairs such that all workers,
        each asked to process one of the buckets, will have
        roughly equal amounts of work.
        
        Example return:
            
            [[('Species1', 'f1.png'), ('Species1', 'f2.png')],
             [('Species2', 'f3.png'), ('Species3', 'f4.png')]]
            
        The caller can then assign the first list to
        one worker, and the second list to another worker.
        
        The number of buckets, and therefore the number
        of eventual workers, may be passed in. If None,
        MAX_PERC_OF_CORES_TO_USE percent of the cores available
        on the current machine will be assumed. If num_workers
        is provided and larger than the number of available
        cores, the number is reduced to the number of cores.
        
        Also returned is the number of workers on which the
        computation is based. This number is always the same
        as the number of species name lists in the return.
        But for clarity, the number is returned explicitly.

        :param in_dir: root of species recordings
        :type in_dir: str
        :param num_workers: number of buckets into which to partition 
        :type num_workers: {int | None}
        :return: list of (species, file name) pair lists, and the
            number of workers
        :rtype: ([[(str, str)]], int)
        '''

        # Create:
        #     {species : num-recordings}
        #     {species : recordings_dir}
        #     [(species1, fpath1), (species1, fpath2), (species2, fpath3)...]

        sample_size_distrib = OrderedDict({})
        sample_dir_dict = {}
        species_file_tuples = []

        for _dir_name, subdir_list, _file_list in os.walk(in_dir):
            for species_name in subdir_list:
                species_spectros_dir = os.path.join(in_dir, species_name)
                spectro_paths = os.listdir(species_spectros_dir)

                # Create new spectro_paths with only spectro files that
                # need chopping:
                new_rec_paths = cls.cull_spectro_paths(species_name, dst_dir,
                                                       spectro_paths,
                                                       overwrite_policy)

                sample_size_distrib[species_name] = len(spectro_paths)
                sample_dir_dict[species_name] = species_spectros_dir
                species_file_pairs = list(
                    zip([species_name] * len(new_rec_paths), new_rec_paths))

                species_file_tuples.extend(species_file_pairs)
            break

        if len(species_file_tuples) == 0:
            # If no subdirectories with spectrograms were
            # found, warn:
            cls.log.warn(
                (f"\n"
                 f"    All spectrograms in {in_dir} already chopped.\n"
                 f"    Or did you mean to create an individual file\n"
                 f"    rather than a set of species subdirs?"))

        num_cores = mp.cpu_count()
        # Use MAX_PERC_OF_CORES_TO_USE percent of the cores:
        if num_workers is None:
            num_workers = round(num_cores * Utils.MAX_PERC_OF_CORES_TO_USE /
                                100)
        elif num_workers > num_cores:
            # Limit pool size to number of cores:
            num_workers = num_cores

        # Create a partitioning into equal sized files,
        # regardless of species association.

        assignments = cls.partition_by_recordings(species_file_tuples,
                                                  num_workers)
        num_workers_used = len(assignments)
        return assignments, num_workers_used

    #------------------------------------
    # partition_by_recordings
    #-------------------

    @classmethod
    def partition_by_recordings(cls, species_file_pairs, num_workers):
        '''
        Given a list of species-name/file-path tuples, 
        partition that list into num_workers sublists,
        such that each list contains roughly the same
        number of tuples. If the number of species_file_pairs
        tuples is not divisible by num_workers, the left-over
        tuples are distributed over the first sublists.

        :param species_file_pairs:
        :type species_file_pairs:
        :param num_workers:
        :type num_workers:
        :return partitioning of the species_file_pairs tuples
        :rtype: [[(str, str)]]
        '''

        # Compute near-equal number of files per worker:
        num_spectros = len(species_file_pairs)
        spectros_per_worker = int(np.ceil(num_spectros / num_workers))

        # Create list of species-file pair lists:
        #    [[(s1,f1), (s1,f2)], [(s1,f3), (s2,f4)], ...]
        # Each inner list will be handled by one worker:

        assignments = []
        assign_idx = 0
        for _worker_idx in range(num_workers):
            assign_sublist = species_file_pairs[assign_idx:assign_idx +
                                                spectros_per_worker]
            assignments.append(assign_sublist)
            assign_idx += spectros_per_worker

        num_tasks = sum([len(ass) for ass in assignments])
        # Because spectros_per_worker is computed with ceil(),
        # this case should never occur; kept as a safeguard:
        left_overs = num_spectros - num_tasks
        if left_overs > 0:
            # Can't have more than num_workers left overs,
            # meaning can't have more leftovers than
            # sublists. Distribute the leftovers:

            for idx, left_over in enumerate(species_file_pairs[-left_overs:]):
                assignments[idx].append(left_over)

        # Remove empty assignments:
        assignments = [ass for ass in assignments if len(ass) > 0]
        return assignments
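
    # Small worked example (a sketch): 7 (species, file) pairs and
    # num_workers = 3 give spectros_per_worker = ceil(7/3) = 3, so the
    # sublists have sizes 3, 3 and 1; the empty-assignment filter above
    # only matters when num_workers exceeds the number of pairs.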

    #------------------------------------
    # run_workers
    #-------------------

    @classmethod
    def run_workers(cls,
                    args,
                    global_info,
                    overwrite_policy=WhenAlreadyDone.ASK):
        '''
        Called by main to run the SpectrogramChopper in
        multiple processes at once. Partitions the
        audio files to be processed; runs the chopping
        while giving visual progress on terminal.
        
        Prints success/failure of each worker. Then
        returns. In order to avoid processes repeatedly
        reporting the same, or only locally kept info,
        the globally visible dict `global_info` is passed in.
        
        This method will add these key/val pairs:
        
           1 The total number of spectros to chop (key 'snips_to_do')
           2 The number of already created snippets (key 'snips_done')
           3 A list with values False for each job, indicating
               that the corresponding job is not yet done (key 'jobs_status')

        Processes will update 2 and 3 as they report progress: 

        :param args: all arguments provided to argparse
        :type args: {str : Any}
        :param global_info: interprocess communication
            dict for reporting progress
        :type global_info: multiprocessing.manager.dict
        '''

        # Get a list of lists of species names
        # to process. The list is computed such
        # that each worker has roughly the same
        # number of recordings to chop. We let
        # the method determine the number of workers
        # by using 80% of the available cores.

        (worker_assignments,
         num_workers) = SpectrogramChopper.compute_worker_assignments(
             args.input, args.outdir, num_workers=args.workers)

        print(f"Distributing workload across {num_workers} workers.")

        # Initialize the dict with shared information:

        # Fill the inter-process list with False.
        # It is used to avoid logging a job's completion
        # to the console multiple times (i.e. it serves no
        # purpose other than progress reporting):

        for _i in range(num_workers):
            # NOTE: reportedly one cannot just set the passed-in
            #       list to [False]*num_workers, b/c
            #       a regular python list won't be
            #       seen by other processes, even if
            #       embedded in a multiprocessing.manager.list
            #       instance:
            global_info['jobs_status'].append(False)

        # Number of full spectrograms to chop:
        global_info['snips_to_do'] = len(
            Utils.find_in_dir_tree(args.input, pattern="*.png"))

        # For progress reports, get number of already
        # existing .png files in out directory:
        global_info['snips_done'] = len(
            Utils.find_in_dir_tree(args.outdir, pattern="*.png"))

        # Assign each list of species to one worker:

        chopping_jobs = []
        for ass_num, assignment in enumerate(worker_assignments):
            chopper = SpectrogramChopper(args.input,
                                         args.outdir,
                                         overwrite_policy=overwrite_policy)
            ret_value_slot = mp.Value("b", False)
            job = ProcessWithoutWarnings(
                target=chopper.chop_from_file_list,
                args=(
                    assignment,
                    args.input,
                    args.outdir,
                    global_info,  # shared progress dict
                    overwrite_policy,
                    ret_value_slot),
                name=f"ass# {ass_num}")
            job.ret_val = ret_value_slot

            chopping_jobs.append(job)
            print(f"Starting chops for {job.name}")
            job.start()

        start_time = datetime.datetime.now()

        # Keep checking on each job until
        # all are done, as indicated by all jobs_status
        # values being True, a.k.a. valued 1:

        while sum(global_info['jobs_status']) < num_workers:
            for job_idx, job in enumerate(chopping_jobs):
                # Timeout 1 sec
                job.join(1)
                if job.exitcode is not None:
                    if global_info['jobs_status'][job_idx]:
                        # One of the processes has already
                        # reported this job as done. Don't
                        # report it again:
                        continue

                    # Let other processes know that this job
                    # is done, and they don't need to report
                    # that fact: we'll do it here below:
                    global_info['jobs_status'][job_idx] = True

                    # This job finished, and that fact has not
                    # been logged yet to the console:

                    res = "OK" if job.ret_val else "Error"
                    # New line after the single-line progress msgs:
                    print("")
                    print(
                        f"Worker {job.name}/{num_workers} finished with: {res}"
                    )
                    global_info['snips_done'] = cls.sign_of_life(
                        job,
                        global_info['snips_done'],
                        args.outdir,
                        start_time,
                        force_rewrite=True)
                    # Check on next job:
                    continue

                # This job not finished yet.
                # Time for sign of life?
                global_info['snips_done'] = cls.sign_of_life(
                    job,
                    global_info['snips_done'],
                    args.outdir,
                    start_time,
                    force_rewrite=True)

    #------------------------------------
    # cull_spectro_paths
    #-------------------

    @classmethod
    def cull_spectro_paths(cls,
                           species_or_recorder_name,
                           dst_dir,
                           rec_paths,
                           overwrite_policy=WhenAlreadyDone.ASK):
        #******* DISABLED ************
        # method analogous to cull_rec_paths() in create_spectrograms()
        # Currently below is just a copy from create_spectrograms().
        # If we end up needing culling, update this body
        return rec_paths
        #******* DISABLED ************
        # NEVER REACHED
        new_rec_paths = []
        for aud_fname in rec_paths:
            fname_stem = Path(aud_fname).stem
            dst_path = os.path.join(dst_dir, species_or_recorder_name,
                                    f"{fname_stem}.png")
            if not os.path.exists(dst_path):
                # Destination spectrogram does not exist;
                # keep this audio file in the to-do list:
                new_rec_paths.append(aud_fname)
                continue
            if overwrite_policy == WhenAlreadyDone.OVERWRITE:
                os.remove(dst_path)
                new_rec_paths.append(aud_fname)
                continue
            if overwrite_policy == WhenAlreadyDone.SKIP:
                # Don't even assign audio file to a worker,
                # since its spectro already exists:
                continue
            if overwrite_policy == WhenAlreadyDone.ASK:
                if Utils.user_confirm(
                        f"Spectrogram for {dst_path} exists; overwrite?"):
                    os.remove(dst_path)
                    new_rec_paths.append(aud_fname)
                    continue
        return new_rec_paths

    #------------------------------------
    # sign_of_life
    #-------------------

    @classmethod
    def sign_of_life(cls,
                     job,
                     num_already_present_imgs,
                     outdir,
                     start_time,
                     force_rewrite=False):

        # Time for sign of life?
        now_time = datetime.datetime.now()
        time_duration = now_time - start_time
        # Every 3 seconds (but not right at the start):
        if force_rewrite \
           or (time_duration.seconds > 0 and time_duration.seconds % 3 == 0):

            # A human readable duration string, down to minutes:
            duration_str = FileUtils.time_delta_str(time_duration,
                                                    granularity=4)

            # Get current and new spectro imgs in outdir:
            num_now_present_imgs = len(
                Utils.find_in_dir_tree(outdir, pattern="*.png"))
            num_newly_present_imgs = num_now_present_imgs - num_already_present_imgs

            # Keep printing number of done snippets in the same
            # terminal line:
            print((f"{job.name}---Number of spectros: {num_now_present_imgs} "
                   f"({num_newly_present_imgs} new) after {duration_str}"),
                  end='\r')
            # Return the new total so the caller can use it as
            # num_already_present_imgs in the next call:
            return num_now_present_imgs
        else:
            return num_already_present_imgs
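
A minimal single-process usage sketch (paths and species layout are hypothetical; it assumes the imports used by the class above are available): compute the worker assignments, then chop them in the current process instead of spawning workers as run_workers() does:

import multiprocessing as mp

in_dir  = '/data/spectrograms'          # hypothetical input root
out_dir = '/data/spectrogram_snippets'  # hypothetical output root

# The constructor creates the per-species destination directories:
SpectrogramChopper(in_dir, out_dir, overwrite_policy=WhenAlreadyDone.SKIP)

assignments, _num_workers = SpectrogramChopper.compute_worker_assignments(
    in_dir, out_dir, overwrite_policy=WhenAlreadyDone.SKIP, num_workers=1)

success = mp.Value('b', False)
for assignment in assignments:
    SpectrogramChopper.chop_from_file_list(assignment,
                                           in_dir,
                                           out_dir,
                                           {},    # global_info unused here
                                           WhenAlreadyDone.SKIP,
                                           success)
print(f"All snippets created: {bool(success.value)}")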
Exemplo n.º 13
0
    def __init__(self, 
                 in_dir_or_spectro_file, 
                 outdir, 
                 specific_species=None,
                 overwrite_policy=WhenAlreadyDone.ASK,
                 generate_wav_files=False
                 ):
        '''
        The overwrite_policy is one of the WhenAlreadyDone
        enum members: ASK, OVERWRITE, SKIP. If ASK,
        request user's permission for each encountered
        destination file. SKIP should be used when resuming
        an interrupted chopping session. Any sound file
        whose destination spectrogram exists is not processed
        again.
        
        If generate_wav_files is True, a .wav file is created
        for every window of the source soundfile. Usually
        not necessary.
        
        The window_size is the number of seconds by which a
        sliding window is moved across the source soundfile
        before a spectrogram is created.
          
        
        @param in_dir_or_spectro_file: location of soundfile root
        @type in_dir_or_spectro_file: str
        @param outdir: root of spectrograms/wav_files to create
        @type outdir: str
        @param specific_species: process only a specific list of species
        @type specific_species: {None | [str]}
        @param overwrite_policy: what to do when an output file already exists
        @type overwrite_policy: WhenAlreadyDone
        '''

        self.in_dir         	= in_dir_or_spectro_file
        self.out_dir        	= outdir
        self.specific_species   = specific_species
        self.overwrite_policy   = overwrite_policy
        self.generate_wav_files = generate_wav_files
        
        self.log = LoggingService()
        
        self.num_chopped = 0

        # Don't show the annoying deprecation
        # librosa.display() warnings about renaming
        # 'basey' to 'base' to match matplotlib: 
        warnings.simplefilter("ignore", category=MatplotlibDeprecationWarning)
        
        # Hide the UserWarning: PySoundFile failed. Trying audioread instead.
        warnings.filterwarnings(action="ignore",
                                message="PySoundFile failed. Trying audioread instead.",
                                category=UserWarning, 
                                module='', 
                                lineno=0)

        if self.specific_species is None:
            self.species_list = os.listdir(self.in_dir)
        else:
            self.species_list = self.specific_species
        
        # Create directories for new audio snippets
        # and spectrograms:
        
        self.wav_dir_path, self.spectrogram_dir_path = self.create_dest_dirs(self.species_list)
        
        # Allow others outside the instance to
        # find the audio snippet destination:
        SpectrogramChopper.wav_dir_path = self.wav_dir_path
        SpectrogramChopper.spectrogram_dir_path = self.spectrogram_dir_path
Exemplo n.º 14
0
class SpectrogramChopper:
    '''
    Processes directories of .wav or .mp3 files,
    chopping them into window_len-second snippets.
    Each audio snippet is saved, and spectrograms
    are created for each.
    
    Assumes:

                        self.in_dir
                        
          Species1        Species2   ...     Speciesn
           smpl1_1.mp3      smpl2_1.mp3         smpln_1.mp3
           smpl1_2.mp3      smpl2_2.mp3         smpln_2.mp3
                            ...
                            
    Saves the snippets in a new directory. Creates a spectrogram 
    for each snippet, and saves those in a different, new directory.
        
        Resulting directories under self.out_dir will be:
         
                         self.out_dir
            spectrograms               wav-files

    Because many spectrograms are created, speed requirements
    call for the use of parallelism. Since each audio file's processing
    is independent from the others, the multiprocessing library
    is used as follows.
    
        - If command line arg --workers is set to 1, no parallelism
          is used. 
        - If multiple cores are available, some percentage
          of them will be deployed to chopping. Each core runs 
          a separate copy of this file. The percentage is controlled
          by MAX_PERC_OF_CORES_TO_USE.
        
    Method chop_all() is used in the single core scenario.
    Method chop_from_file_list() is used when multiprocessing. This
    method is the 'target' in the multiprocessing library's sense.

    '''

    # If multiple cores are available,
    # only use some percentage of them to
    # be nice:
    
    MAX_PERC_OF_CORES_TO_USE = 50

    #------------------------------------
    # Constructor 
    #-------------------
    
    def __init__(self, 
                 in_dir_or_spectro_file, 
                 outdir, 
                 specific_species=None,
                 overwrite_policy=WhenAlreadyDone.ASK,
                 generate_wav_files=False
                 ):
        '''
        The overwrite_policy is one of the WhenAlreadyDone
        enum members: ASK, OVERWRITE, SKIP. If ASK,
        request user's permission for each encountered
        destination file. SKIP should be used when resuming
        an interrupted chopping session. Any sound file
        whose destination spectrogram exists is not processed
        again.
        
        If generate_wav_files is True, a .wav file is created
        for every window of the source soundfile. Usually
        not necessary.
        
        The window_size is the number of seconds by which a
        sliding window is moved across the source soundfile
        before a spectrogram is created.
          
        
        @param in_dir_or_spectro_file: location of soundfile root
        @type in_dir_or_spectro_file: str
        @param outdir: root of spectrograms/wav_files to create
        @type outdir: str
        @param specific_species: process only a specific list of species
        @type specific_species: {None | [str]}
        @param overwrite_policy: what to do when an output file already exists
        @type overwrite_policy: WhenAlreadyDone
        '''

        self.in_dir         	= in_dir_or_spectro_file
        self.out_dir        	= outdir
        self.specific_species   = specific_species
        self.overwrite_policy   = overwrite_policy
        self.generate_wav_files = generate_wav_files
        
        self.log = LoggingService()
        
        self.num_chopped = 0

        # Don't show the annoying deprecation
        # librosa.display() warnings about renaming
        # 'basey' to 'base' to match matplotlib: 
        warnings.simplefilter("ignore", category=MatplotlibDeprecationWarning)
        
        # Hide the UserWarning: PySoundFile failed. Trying audioread instead.
        warnings.filterwarnings(action="ignore",
                                message="PySoundFile failed. Trying audioread instead.",
                                category=UserWarning, 
                                module='', 
                                lineno=0)

        if self.specific_species is None:
            self.species_list = os.listdir(self.in_dir)
        else:
            self.species_list = self.specific_species
        
        # Create directories for new audio snippets
        # and spectrograms:
        
        self.wav_dir_path, self.spectrogram_dir_path = self.create_dest_dirs(self.species_list)
        
        # Allow others outside the instance to
        # find the audio snippet destination:
        SpectrogramChopper.wav_dir_path = self.wav_dir_path
        SpectrogramChopper.spectrogram_dir_path = self.spectrogram_dir_path

    #------------------------------------
    # chop_all
    #-------------------

    def chop_all(self):
        '''
        Workhorse: Assuming self.in_dir is root of all
        species audio samples:
        
                        self.in_dir
                        
          Species1        Species2   ...     Speciesn
           smpl1_1.mp3      smpl2_1.mp3         smpln_1.mp3
           smpl1_2.mp3      smpl2_2.mp3         smpln_2.mp3
                            ...
                            
        Chops each .mp3 (or .wav) file into window_len snippets.
        Saves those snippets in a new directory. Creates a spectrogram 
        for each snippet, and saves those in a different, new directory.
        
        Resulting directories under self.out_dir will be:
         
                         self.out_dir
            spectrograms               wav-files
            
        If self.specific_species is None, audio files under all
        species are chopped. Else, self.specific_species is 
        expected to be a list of species names that correspond
        to the names of species directories above: Species1, Species2, etc.
        
        Returns a 2-tuple: (number of created .wav audio snippet files,
                            number of created .png spectrogram snippet files)
        
        '''
        for species in self.species_list:
            audio_files = os.listdir(os.path.join(self.in_dir, species))
            num_files   = len(audio_files)
            for i, sample_name in enumerate(audio_files):
                # Chop one audio file:
                self.log.info(f"Chopping {species} audio {i}/{num_files}")
                self.chop_one_audio_file(self.in_dir, species, sample_name, self.out_dir)
            self.num_chopped += num_files

        # find_in_dir_tree() returns the matching paths
        # (cf. its use in run_workers()); count them:
        num_spectros = len(utils.find_in_dir_tree(self.spectrogram_dir_path, pattern='*.png'))
        num_audios   = len(utils.find_in_dir_tree(self.wav_dir_path, pattern='*.wav'))
        return (num_audios, num_spectros)
    
    #------------------------------------
    # chop_from_file_list 
    #-------------------
    
    def chop_from_file_list(self, assignments, return_bool, env=None):
        '''
        Takes a list like:
        
           [(s1,f1),(s1,f2),(s4,f3)]
           
        where s_n is a species name, and f_m
        is the basename of an audio file to chop.
        Example: foobar.mp3
        
        Sets return_bool.value to True if all went
        well; else sets it to False and re-raises
        the exception.
        
        Wrinkle: this method is called under two 
        very different scenarios (S1/S2). S1 is
        when the process started by the user calls
        this method. That happens when the command
        line arg --workers is set to 1, or on a machine
        where few enough cores are available that only
        one is used. In that case, env is left at None,
        and all is as normal.
        
        S2 occurs when the initial process (the one started
        from the command line) starts a new Process. That
        process normally contains a new environment, i.e. 
        some default value for all the environment variables.
        In particular, DISPLAY and PYTHONPATH will not be
        what is needed. The result is that all spectrogram
        creating methods fail, because they cannot find a
        graphics backend. 
        
        In that case kwarg env is set to the environment of the 
        initiating process. At the start of this method this
        process' default environ is then set to match that
        of the initiating process.
        
        :param assignments: list of species/filename pairs
        :type assignments: [(str,str)]
        :param return_bool: shared boolean (multiprocessing.Value)
            through which success or failure is reported back
            to the parent process
        :type return_bool: multiprocessing.Value
        :param env: if provided, the environment of the
            parent process. If None, the current env
            is retained
        :type env: {str : Any}
        '''
       
        # During multiprocessing this method is
        # the target, i.e. the entry point for 
        # each child. In that case env will be 
        # the environment of the initiating process.
        # We adopt that environment for this new,
        # forked process as well:
        
        if env is not None:
            os.environ = env

        for species_name, fname in assignments:
            try:
                self.chop_one_audio_file(self.in_dir,
                                         species_name,
                                         fname,
                                         self.out_dir
                                         )
            except Exception as e:
                return_bool.value = False
                raise e
            
        return_bool.value = True
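
    # Illustrative sketch (not called anywhere in this file): how a
    # parent process might hand its environment to a worker so that
    # DISPLAY and PYTHONPATH survive the fork, as described in the
    # docstring above. The names `chopper` and `assignment` are
    # assumed to exist in the caller:
    #
    #   import multiprocessing as mp
    #   import os
    #
    #   ret_val = mp.Value("b", False)
    #   job = mp.Process(target=chopper.chop_from_file_list,
    #                    args=(assignment, ret_val),
    #                    kwargs={'env' : os.environ.copy()}
    #                    )
    #   job.start()
    #   job.join()
    #   print("Worker succeeded" if ret_val.value else "Worker failed")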
    
    #------------------------------------
    # chop_one_audio_file 
    #-------------------

    def chop_one_audio_file(self, in_dir, species, spectro_fname, out_dir, window_len = 5):
        """
        Generates window_len second sound file snippets
        and associated spectrograms from sound files of
        arbitrary length. 
        
        The window 'rolls' across the file in 1-second
        steps, so successive snippets overlap and no
        information is lost.
    
        :param in_dir: root directory of the species subdirectories
        :type in_dir: str
        :param species: name of the species subdirectory (under
            in_dir) that contains the audio file
        :type species: str
        :param spectro_fname: basename of the audio file to chop
        :type spectro_fname: str
        :param out_dir: root directory under which spectrogram
            and audio snippets will be saved (in different subdirs)
        :type out_dir: str
        """

        orig, sample_rate = librosa.load(os.path.join(in_dir, species, spectro_fname))
        length = int(librosa.get_duration(orig, sample_rate))
        for start_time in range(length - window_len):
            fpath = Path(spectro_fname)
            window_name = f"{fpath.stem}_sw-start{str(start_time)}"
            window_file_name = str(Path.joinpath(fpath.parent, window_name))

            outfile_spectro = os.path.join(out_dir, 
                                           'spectrograms/', 
                                           species,
                                           f"{window_file_name}.png")
            
            outfile_audio = os.path.join(out_dir, 
                                         'wav-files', 
                                         species, 
                                         f"{window_file_name}.{'wav'}")
            
            
            spectro_done = os.path.exists(outfile_spectro)
            audio_done   = os.path.exists(outfile_audio)

            if spectro_done and audio_done and self.overwrite_policy == WhenAlreadyDone.SKIP:
                # No brainer; no need to even read the audio excerpt:
                continue
            
            if spectro_done and not audio_done and not self.generate_wav_files:
                continue

            # Need an audio snippet either for
            # a spectrogram or wav file:
            window_audio, sr = librosa.load(os.path.join(in_dir, species, spectro_fname),
                                      offset=start_time, duration=window_len)

            if not spectro_done or (spectro_done and self.overwrite_policy != WhenAlreadyDone.SKIP):
                SoundProcessor.create_spectrogram(window_audio,sr,outfile_spectro)
            

            if self.generate_wav_files:
                if audio_done and self.overwrite_policy == WhenAlreadyDone.SKIP:
                    continue 
                else:
                    sf.write(outfile_audio, window_audio, sr)
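
    # Worked example of the windowing above (a sketch, assuming a
    # hypothetical 12-second file 'foobar.mp3' and the default
    # window_len of 5 seconds):
    #
    #   window_len = 5
    #   length     = 12
    #   starts     = list(range(length - window_len))      # [0, 1, ..., 6]
    #   names      = [f"foobar_sw-start{s}" for s in starts]
    #   # -> seven overlapping 5-second windows, hop of 1 second,
    #   #    saved as foobar_sw-start0.png ... foobar_sw-start6.png
    #   #    (plus .wav twins if generate_wav_files is True)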

    #------------------------------------
    # create_dest_dirs 
    #-------------------

    def create_dest_dirs(self, species_list):
        '''
        Creates all directories that will hold new 
        audio snippets and spectrograms for each species.
        For each directory: if dir exists:
           o if overwrite_policy is WhenAlreadyDone.OVERWRITE, wipe the dir
           o if overwrite_policy is WhenAlreadyDone.ASK, ask the user:
                If response is Yes, wipe the dir
                else raise FileExistsError
                
        :param species_list: names of species to process
        :type species_list: [str]
        :return: top level dirs for audio snippets and spectrograms
        :rtype: (str, str)
        :raise FileExistsError: if a dest dir exists and not allowed
            to wipe it.
        '''

        # Root dir of the two dirs that will hold new 
        # audio snippet and spectrogram files
        utils.create_folder(self.out_dir, overwrite_policy=self.overwrite_policy)

        # Below the root:
        spectrogram_dir_path = os.path.join(self.out_dir,'spectrograms/')
        wav_dir_path = os.path.join(self.out_dir,'wav-files/')

        if not utils.create_folder(spectrogram_dir_path, overwrite_policy=self.overwrite_policy):
            raise FileExistsError(f"Target dir {spectrogram_dir_path} exists; aborting")
        if not utils.create_folder(wav_dir_path, overwrite_policy=self.overwrite_policy):
            raise FileExistsError(f"Target dir {spectrogram_dir_path} exists; aborting")
        
        # One dir each for the audio and spectrogram
        # snippets of one species:
        
        for species in species_list:
            species_spectros_dir = os.path.join(spectrogram_dir_path, species)
            if not utils.create_folder(species_spectros_dir,
                                       overwrite_policy=self.overwrite_policy):
                raise FileExistsError(f"Target dir {species_spectros_dir} exists; aborting")
            
            species_audio_dir = os.path.join(wav_dir_path, species)
            if not utils.create_folder(species_audio_dir,
                                       overwrite_policy=self.overwrite_policy):
                raise FileExistsError(f"Target dir {species_audio_dir} exists; aborting")

        return(wav_dir_path, spectrogram_dir_path)

    # -------------------- Class Methods ------------
    
    #------------------------------------
    # compute_worker_assignments 
    #-------------------
    
    @classmethod
    def compute_worker_assignments(cls, in_dir, num_workers=None):
        '''
        Given the root directory of a set of
        directories whose names are species,
        and which contain recordings by species,
        return a multi processing worker assignment.
        
        Expected:
                         in_dir

          Species1        Species2   ...     Speciesn
           smpl1_1.mp3      smpl2_1.mp3         smpln_1.mp3
           smpl1_2.mp3      smpl2_2.mp3         smpln_2mp3
                            ...
        
        Collects the recordings available for
        each species. Creates a list of
        (species, recording-file) buckets such that all
        workers asked to process one of the buckets
        will have roughly equal amounts of work.
        
        Example return:
            
            [[('Species1', 'f1.mp3'), ('Species1', 'f2.mp3')],
             [('Species2', 'f3.mp3'), ('Species3', 'f4.mp3')]]
            
        The caller can then assign the first list to
        one worker, and the second list to another worker.
        
        The number of buckets, and therefore the number
        of eventual workers, may be passed in. If None, 
        MAX_PERC_OF_CORES_TO_USE percent of the cores available
        on the current machine will be used. If num_workers is
        provided and the number is larger than the number of
        available cores, the number is reduced to the number of cores.
        
        Also returned is the number of workers on which the
        computation is based. This number is always the same
        as the number of buckets in the return.
        But for clarity, the number is returned explicitly.

        :param in_dir: root of species recordings
        :type in_dir: str
        :param num_workers: number of buckets into which to partition 
        :type num_workers: {int | None}
        :return: list of (species, file) buckets, and number of workers
        :rtype: ([[(str, str)]], int)
        '''

        # Create:
        #     {species : num-recordings}
        #     {species : recordings_dir}
        #     [(species1, fpath1), (species1, fpath2), (species2, fpath3)...]  
        
        sample_size_distrib = OrderedDict({})
        sample_dir_dict     = {}
        species_file_tuples = []
        
        for _dir_name, subdir_list, _file_list in os.walk(in_dir):
            for species_name in subdir_list:
                species_recordings_dir = os.path.join(in_dir, species_name)
                rec_paths = os.listdir(species_recordings_dir)
                sample_size_distrib[species_name] = len(rec_paths)
                sample_dir_dict[species_name] = species_recordings_dir
                species_file_pairs = list(zip([species_name]*len(rec_paths), rec_paths))
                species_file_tuples.extend(species_file_pairs)
            break 
        
        num_cores = mp.cpu_count()
        # Use MAX_PERC_OF_CORES_TO_USE percent of the cores:
        if num_workers is None:
            num_workers = round(num_cores * SpectrogramChopper.MAX_PERC_OF_CORES_TO_USE  / 100)
        elif num_workers > num_cores:
            # Limit pool size to number of cores:
            num_workers = num_cores

        # Create a partitioning into equal sized files,
        # regardless of species association.
        
        assignments = cls.partition_by_recordings(species_file_tuples,
                                                  num_workers)
        num_workers_used = len(assignments)
        return assignments, num_workers_used
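
    # Usage sketch (the directory path is hypothetical):
    #
    #   assignments, n_workers = SpectrogramChopper.compute_worker_assignments(
    #       '/data/birdsong',           # assumed root with one subdir per species
    #       num_workers=None)           # let MAX_PERC_OF_CORES_TO_USE decide
    #   # assignments might look like:
    #   #   [[('Species1', 's1_1.mp3'), ('Species1', 's1_2.mp3')],
    #   #    [('Species2', 's2_1.mp3'), ('Species3', 's3_1.mp3')]]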

    #------------------------------------
    # partition_by_recordings 
    #-------------------
    
    @classmethod
    def partition_by_recordings(cls, species_file_pairs, num_workers):
        '''
        Given a list of species-name/file-path tuples, 
        partition that list into num_workers sublists,
        such that each list contains roughly the same
        number of tuples. If the number of species_file_pairs
        tuples is not divisible by num_workers, the left-over
        tuples are distributed over the first sublists.

        :param species_file_pairs:
        :type species_file_pairs:
        :param num_workers:
        :type num_workers:
        :return partitioning of the species_file_pairs tuples
        :rtype: [[(str, str)]]
        '''

        # Compute the number of whole recordings per worker;
        # the remainder is distributed further below:
        num_recordings  = len(species_file_pairs)
        recs_per_worker = num_recordings // num_workers
        
        # Create list of species-file pair lists:
        #    [[(s1,f1), (s1,f2)], [(s1,f3), (s2,f4)], ...]
        # Each inner list will be handled by one worker:
        
        assignments = []
        assign_idx  = 0
        for _worker_idx in range(num_workers):
            assign_sublist = species_file_pairs[assign_idx:assign_idx+recs_per_worker]
            assignments.append(assign_sublist)
            assign_idx += recs_per_worker
        
        left_overs = num_recordings % num_workers
        if left_overs > 0:
            # Can't have more leftovers than there
            # are sublists. Distribute the leftovers
            # over the first sublists, one each:
             
            for idx, left_over in enumerate(species_file_pairs[-left_overs:]):
                assignments[idx].append(left_over)
        
        # Remove empty assignments:
        assignments = [ass for ass in assignments if len(ass) > 0]
        return assignments
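
    # Worked example of the partitioning above: 10 recordings and
    # 3 workers give 10 // 3 = 3 whole recordings per worker and
    # 10 % 3 = 1 leftover, which is appended to the first sublist:
    #
    #   pairs = [('sp', f"f{i}.mp3") for i in range(10)]
    #   parts = SpectrogramChopper.partition_by_recordings(pairs, 3)
    #   # len(parts) == 3; sublist sizes are 4, 3, 3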

    #------------------------------------
    # run_workers 
    #-------------------
    
    @classmethod
    def run_workers(cls, args, overwrite_policy=WhenAlreadyDone.ASK):
        '''
        Called by main to run the SpectrogramChopper in
        multiple processes at once. Partitions the
        audio files to be processed; runs the chopping
        while giving visual progress on terminal.
        
        Prints success/failure of each worker. Then
        returns

        :param args: all arguments provided to argparse
        :type args: {str : Any}
        '''
        
        in_dir = args.input
    
        # Get a list of lists of species names
        # to process. The list is computed such
        # that each worker has roughly the same
        # number of recordings to chop. If args.workers
        # is None, the method determines the number of
        # workers from MAX_PERC_OF_CORES_TO_USE.
        
        (worker_assignments, num_workers) = SpectrogramChopper.compute_worker_assignments(
            in_dir,
            num_workers=args.workers)
    
        print(f"Distributing workload across {num_workers} workers.")
        # Assign each list of species to one worker:
        
        chopping_jobs = []
        for ass_num, assignment in enumerate(worker_assignments):
            chopper = SpectrogramChopper(in_dir, 
                                   args.output_dir,
                                   overwrite_policy=overwrite_policy
                                   )
            ret_value_slot = mp.Value("b", False)
            job = ProcessWithoutWarnings(target=chopper.chop_from_file_list,
                                         args=([assignment, ret_value_slot]),
                                         name=f"ass# {ass_num}"
                                         )
            job.ret_val = ret_value_slot
            
            chopping_jobs.append(job)
            print(f"Starting chops for {job.name}")
            job.start()
        
        for job in chopping_jobs:
            job_done = False
            while not job_done:
                # Check for job done with one sec timeout: 
                job.join(1)
                # Get number of generated snippets:
                num_chopped_snippets = \
                    len(utils.find_in_dir_tree(SpectrogramChopper.spectrogram_dir_path))
                # Keep printing number of done snippets in the same
                # terminal line:
                print(f"Number of audio snippets: {num_chopped_snippets}", end='\r')
                # If the call to join() timed out
                if job.exitcode is None:
                    # Job not done:
                    continue
                res = "OK" if job.ret_val else "Error"
                # New line after the progress msgs:
                print("")
                print(f"Chops of {job.name}/{num_workers}: {res}")
                job_done = True
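
    # Usage sketch, assuming an argparse namespace with the
    # attributes this method reads (input, output_dir, workers);
    # the paths are hypothetical:
    #
    #   from argparse import Namespace
    #   args = Namespace(input='/data/birdsong',
    #                    output_dir='/data/chopped',
    #                    workers=None)
    #   SpectrogramChopper.run_workers(args,
    #                                  overwrite_policy=WhenAlreadyDone.SKIP)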
Exemplo n.º 15
0
class Charter:

    # Value to assign to precision,
    # recall, or f1 when divide by 0
    DIV_BY_ZERO = 0

    log = LoggingService()

    #------------------------------------
    # Constructor
    #-------------------

    def __init__(self, actions=None):

        if actions is None:
            return

        for action in actions:
            try:
                if type(action) == VizConfMatrixReq:
                    cm = Charter.read_conf_matrix_from_file(action.path)
                    fig = self.fig_from_conf_matrix(
                        cm,
                        supertitle=action.supertitle,
                        title=action.title,
                        write_in_fields=action.write_in_fields)
                    fig.show()

                elif action == 'pr_curves':
                    # Not implemented yet:
                    pass
            except Exception as _e:
                # Ignore failures of individual actions:
                pass

    #------------------------------------
    # visualize_testing_result
    #-------------------

    @classmethod
    def visualize_testing_result(cls, truth_labels, pred_class_ids):
        '''
        Use to visualize results from using a 
        saved model on a set of test-set samples.
        
        Draws a PR curve, and adds a table with 
        the average precison (AP) of each class.
        '''
        # Find number of classes involved:
        all_class_ids = set(truth_labels)
        num_classes = len(all_class_ids)

        # Will alternately treat each class
        # prediction as a one-vs-all binary
        # classification. For each class ID (cid<n>),
        # get 0/1 guess separately for each sample:
        #
        #                 cid0      cid1
        #   pred_sample0   1          0
        #   pred_sample1   0          0
        #   pred_sample2   0          1
        #             ...
        # Same with labels:
        #                 cid0      cid1
        #   labl_sample0   1          0
        #   labl_sample1   0          0
        #   labl_sample2   0          1
        #             ...

        bin_labels = label_binarize(truth_labels,
                                    classes=list(range(num_classes)))

        # Make tensors just for manipulation
        # convenience:

        bin_labels_tn = torch.tensor(bin_labels)
        preds_tn = torch.tensor(pred_class_ids)

        precisions = dict()
        recalls = dict()
        average_precisions = dict()

        # Go through each column, i.e. the
        # 1/0 labels and the predictions for one
        # class at a time, and get the prec/rec
        # numbers for that one-vs-all problem:

        for i in range(num_classes):

            bin_labels_arr = bin_labels_tn[:, i].tolist()
            preds_arr = preds_tn.tolist()

            # Get precision and recall at each
            # of the default thresholds.
            # compute_binary_pr_curve() needs the class ID
            # and returns a CurveSpecification, which is
            # accessed by key (as in compute_multiclass_pr_curves()):
            curve_spec = cls.compute_binary_pr_curve(bin_labels_arr,
                                                     preds_arr,
                                                     i  # class_id
                                                     )
            precisions[i] = curve_spec['precisions']
            recalls[i] = curve_spec['recalls']

            # Avg prec is:
            #
            #      AP = SUM_over_n((R_n - R_{n-1}) * P_n)
            #
            # I.e. the increase in recall times the current
            # precision as each pred/sample pair is
            # processed:

            average_precisions[i] = \
                average_precision_score(bin_labels_arr,
                                        preds_arr,
                                        average='macro',
                                        )

        mAP = np.mean(list(average_precisions.values()))

        return (mAP, precisions, recalls, average_precisions)
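
    # Minimal sketch of the one-vs-all binarization used above:
    #
    #   from sklearn.preprocessing import label_binarize
    #   truth = [1, 0, 2, 1]
    #   label_binarize(truth, classes=[0, 1, 2])
    #   # -> array([[0, 1, 0],
    #   #           [1, 0, 0],
    #   #           [0, 0, 1],
    #   #           [0, 1, 0]])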

    # ----------------- Computations ---------------

    #------------------------------------
    # compute_binary_pr_curve
    #-------------------

    @classmethod
    def compute_binary_pr_curve(
        cls,
        labels,
        preds,
        class_id,
        thresholds=None,
    ):
        '''
        Return the recall (x-axis) and precision (y-axis)
        values of a PR curve, its average precision (AP),
        and optimal threshold with corresponding f1, precision, 
        and recall values
        
        The optimal threshold's prec and rec yield the
        maximum f1 score. Information provided in the 
        BestOperatingPoint instance that is part of this
        method's return:
        
            threshold
            f1
            prec
            rec

        The result is packaged as a CurveSpecification
        that contains:
        
        	best_op_pt
        	precisions
        	recalls
        	thresholds
        	avg_prec

        Procedure:
        
        A prec/rec point is computed for each 
        threshold point. 
        
        Works for binary classification.
        But can use sklearn's label_binaries to compute 
        separate curves for each class 
        (see compute_multiclass_pr_curves())
        
        Differs from sklearn.precision_recall_curve() in
        that the sklearn method does not take a list
        of thresholds.  
        
        Example:
        (preds are probabilities for one class
         across different samples, so they need
         not add to 1):
        
               labels  = [1,1,0,1]
               preds  = [0.2, 0.4, 0.1, 0.2] 
          thresholds  = [0.3, 0.7]
          
          The predictions are turned into decisions like this:
               preds_decided_0.3 = [0, 1, 0, 0]
               preds_decided_0.7 = [0, 0, 0, 0]
          
          Two prec and rec computations are executed:
          
            pr0:  prec and rec from [1, 1, 0, 1] 
                                    [0, 1, 0, 0]
        
            pr1:  prec and rec from [1, 1, 0, 1]
                                    [0, 0, 0, 0]

           resulting in:
              precs = [p0, p1]
              recs  = [r0, r1]

          F1 values fs = [f0, f1] are computed for p0/r0,
          and p1/r1. The position idx (argmax) of 
          the highest f1 is determined. 
          
          best_op_pt = {
             'threshold' : thresholds[idx], 
             'f1'        : fs[idx], 
             'prec'      : precs[idx] 
             'rec'       : recs[idx]
            }

          Finally the average precision (AP) is
          computed. It derives from precs and recs:
          
          for k=0 to k=n-1
          AP = sum_over_k((recs_k - recs_{k+1}) * precs_k)
          
          where n is the number of thresholds, 
          recs_k and precs_k are the recall and 
          precision at the kth threshold. By definition,
          precs_n = 1, recs_n = 0.

          Returned: a CurveSpecification instance
              containing:
                  best_op_pt
                  precisions
                  recalls
                  avg_prec

        :param labels: integer binary class labels.
            Exs.: [1,1,0,0], ['yes', 'yes', 'no', 'yes']
        :type labels: [int | str]
        :param preds: predictions output from a classifier.
            May be floats or integers
        :type preds: [float | int]
        :param class_id: ID of target class for which this
            curve is being constructed
        :type class_id: {int | str}
        :param thresholds: list of decision thresholds to
            decide whether preds are one class or the other.
            If None, uses [0.2, 0.4, 0.6, 0.8]
        :type thresholds: [float | int]
        :return: CurveSpecification instances with optimal 
            operating point, and lists with prec and recall 
            ready for drawing a PR curve
        :rtype: CurveSpecification
        :raises ValueError if labels hold more than 
            two distinct values
        '''
        if type(labels) != list:
            labels = labels.tolist()

        uniq_classes = set(labels)

        if len(uniq_classes) > 2:
            raise ValueError(
                f"Labels limited to up to two distinct values; got {uniq_classes}"
            )

        if thresholds is None:
            thresholds = [0.2, 0.4, 0.6, 0.8]
        precisions = []
        recalls = []
        class_list = list(uniq_classes)
        # Degenerate case: Only a single
        # class ever occurs in the labels.
        # To make the code below work, we
        # add a copy of that only class to
        # the class list of known classes,
        # and log a warning:
        if len(class_list) == 1:
            cls.log.warn(
                f"Only label {class_list[0]} occurs; always guessing that value."
            )
            class_list.append(class_list[0])

        # So far, no undefined recall or precision
        # i.e. no 0-denominator found:
        undef_prec = False
        undef_rec = False
        undef_f1 = False

        for threshold in thresholds:
            y_pred = []
            for pred in preds:
                # Instead of just class_list[1],
                # must guard against only one
                # class (ID=0) in the labels.
                # In that special case, we always
                # predict 0:
                if pred >= threshold:
                    y_pred.append(class_list[1])
                else:
                    y_pred.append(class_list[0])

            y_pred_tn = torch.tensor(y_pred)

            # When no positives exist and the classifier
            # properly predicts no positives, precision
            # (and analogously recall) has a zero
            # denominator and is undefined. sklearn warns
            # in that case; the warning is escalated to an
            # error, caught, and Charter.DIV_BY_ZERO is
            # substituted:

            try:

                with warnings.catch_warnings():
                    # Escalate the warning to an error so it can be caught:
                    warnings.filterwarnings(
                        "error",
                        #category=UndefinedMetricWarning,
                        category=UserWarning,
                    )
                    precision = precision_score(y_true=labels,
                                                y_pred=y_pred_tn,
                                                pos_label=class_list[1],
                                                zero_division='warn')
            except Exception as _e:
                # Was it a div by zero from the prec calc?
                undef_prec = True
                precision = Charter.DIV_BY_ZERO

            try:

                with warnings.catch_warnings():
                    # Escalate the warning to an error so it can be caught:
                    warnings.filterwarnings(
                        "error",
                        #category=UndefinedMetricWarning
                        category=UserWarning)
                    recall = recall_score(y_true=labels,
                                          y_pred=y_pred_tn,
                                          pos_label=class_list[1],
                                          zero_division=Charter.DIV_BY_ZERO)
            except Exception as _e:
                # Was it a div by zero from the rec calc?
                undef_rec = True
                recall = Charter.DIV_BY_ZERO

            precisions.append(precision)
            recalls.append(recall)

        precs_np = np.array(precisions)
        recs_np = np.array(recalls)

        with warnings.catch_warnings():
            try:
                # Escalate the warnings to errors
                # so they can be caught:
                warnings.filterwarnings(
                    "error",
                    #category=UndefinedMetricWarning
                    category=UserWarning)
                # Numpy reports the 0/0 case as a
                # RuntimeWarning ('true_divide'):
                warnings.filterwarnings(
                    "error",
                    category=RuntimeWarning)
                f1_scores = 2 * (precs_np * recs_np) / (precs_np + recs_np)
            except Exception as _e:
                # Was it a div by zero from the f1 calc?
                undef_f1 = True
                # When both prec and recall are 0,
                # set f1 to zero:
                f1_scores = torch.tensor([Charter.DIV_BY_ZERO] * len(precs_np))

        best_op_idx = np.argmax(f1_scores)

        best_operating_pt = BestOperatingPoint(thresholds[best_op_idx],
                                               f1_scores[best_op_idx],
                                               precisions[best_op_idx],
                                               recalls[best_op_idx])

        # To make average_precision computation
        # work:
        recs_np_padded = np.append(recs_np, [0])
        precs_np_padded = np.append(precs_np, [1])

        avg_precision = \
            np.sum((recs_np_padded[:-1] - recs_np_padded[1:]) * precs_np_padded[:-1])

        res = CurveSpecification(best_operating_pt,
                                 recs_np_padded,
                                 precs_np_padded,
                                 thresholds,
                                 avg_precision,
                                 class_id,
                                 undef_prec=undef_prec,
                                 undef_rec=undef_rec,
                                 undef_f1=undef_f1)

        return res
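
    # Worked example of the average-precision step above, using
    # the same padding as the code (recall padded with 0,
    # precision padded with 1):
    #
    #   import numpy as np
    #   recs  = np.array([1.0, 0.5])     # recalls at two thresholds
    #   precs = np.array([0.6, 0.8])     # precisions at two thresholds
    #   recs_p, precs_p = np.append(recs, [0]), np.append(precs, [1])
    #   ap = np.sum((recs_p[:-1] - recs_p[1:]) * precs_p[:-1])
    #   # ap == (1.0 - 0.5)*0.6 + (0.5 - 0)*0.8 == 0.7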

    #------------------------------------
    # compute_confusion_matrix
    #-------------------

    @classmethod
    def compute_confusion_matrix(cls,
                                 truth_labels,
                                 predicted_class_ids,
                                 class_names,
                                 normalize=False):
        '''
        Example Confusion matrix for 16 samples,
        in 3 classes:
        
                     C_1-pred, C_2-pred, C_3-pred
         C_1-true        3         1        0
         C_2-true        2         6        1
         C_3-true        0         0        3
        
        The number of classes is needed to let 
        sklearn know even about classes that were not
        encountered among the samples.
        
        Assumption: class_names contains the list 
        of class names, i.e. not the numeric IDs, but the
        ones to use when labeling the matrix.

        :param truth_labels: truth labels as list of class ids
        :type truth_labels: [int]
        :param predicted_class_ids: list of class_ids that were
            predicted, in same order as truth_labels
        :type predicted_class_ids: [int]
        :param class_names: list of class names as known to the
            user, i.e. not the numeric class ints. But the names
            to use as matrix labels in class id order!
        :type class_names: [str]
        :param normalize: whether or not to normalize ROWS
            to add to 1. I.e. turn cells into percentages
        :type normalize: bool
        :return: a dataframe of the confusion matrix; columns 
            and rows (i.e. index) set to class ids
        :rtype: pd.DataFrame 
        '''
        conf_matrix = torch.tensor(
            confusion_matrix(
                truth_labels,  # Truth
                predicted_class_ids,  # Prediction
                labels=list(range(len(class_names)))  # Numeric class ID labels
            ))

        if normalize:
            conf_matrix = cls.calc_conf_matrix_norm(conf_matrix)

        # Turn conf matrix from tensors to numpy, and
        # from there to a dataframe:
        conf_matrix_df = pd.DataFrame(conf_matrix.numpy(),
                                      index=class_names,
                                      columns=class_names)
        return conf_matrix_df
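
    # Usage sketch (the class names are hypothetical):
    #
    #   truth = [0, 1, 1, 2]
    #   preds = [0, 1, 2, 2]
    #   cm_df = Charter.compute_confusion_matrix(truth,
    #                                            preds,
    #                                            ['wren', 'owl', 'finch'])
    #   #          wren  owl  finch
    #   #   wren      1    0      0
    #   #   owl       0    1      1
    #   #   finch     0    0      1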

    #------------------------------------
    # calc_conf_matrix_norm
    #-------------------

    @classmethod
    def calc_conf_matrix_norm(cls, conf_matrix):
        '''
        Calculates a normalized confusion matrix.
        Normalizes by the number of samples that each 
        species contributed to the confusion matrix.
        Each cell in the returned matrix will be the
        fraction of the row's samples that landed in
        that cell. If no samples were present for a
        particular class, the respective cells will
        contain NaN (or 0, depending on input type).
        
        It is assumed that rows correspond to the classes 
        truth labels, and cols to the classes of the
        predictions.
        
        :param conf_matrix: confusion matrix to normalize
        :type conf_matrix: {pd.DataFrame[int] | np.array | torch.Tensor}
        :return: a new confusion matrix with cells replaced
            by the fraction of the row's samples for which
            that cell's prediction was made. Cells of classes
            without any samples in the dataset will contain
            NaN (or 0, depending on input type)
        :rtype: same type as the input
        '''

        # Get the sum of each row, which is the number
        # of samples in that row's class. Then divide
        # each element in the row by that num of samples
        # to get the percentage of predictions that ended
        # up in each cell:

        # When a class had no samples at all,
        # there will be divide-by-zero occurrences.
        # Suppress related warnings. The respective
        # cells will contain nan:

        with np.errstate(divide='ignore', invalid='ignore'):
            if type(conf_matrix) == np.ndarray:
                return sklearn.preprocessing.normalize(conf_matrix, norm='l1')
            elif type(conf_matrix) == torch.Tensor:
                return conf_matrix.true_divide(
                    torch.sum(conf_matrix, axis=1).unsqueeze(-1))
            elif type(conf_matrix) == pd.DataFrame:
                return conf_matrix.div(conf_matrix.sum(axis='columns'),
                                       axis='rows')
            else:
                raise TypeError(
                    f"Matrix must be a dataframe, numpy array, or tensor, not {type(conf_matrix)}"
                )
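
    # Worked example of the row normalization above:
    #
    #   cm = pd.DataFrame([[3, 1, 0],
    #                      [2, 6, 1],
    #                      [0, 0, 0]])       # last class had no samples
    #   Charter.calc_conf_matrix_norm(cm)
    #   # row 0 -> 0.75, 0.25, 0.00
    #   # row 1 -> ~0.22, ~0.67, ~0.11
    #   # row 2 -> NaN, NaN, NaN   (division by zero)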

    #------------------------------------
    # compute_multiclass_pr_curves
    #-------------------

    @classmethod
    def compute_multiclass_pr_curves(cls,
                                     truth_labels,
                                     raw_preds,
                                     thresholds=[0.2, 0.4, 0.6, 0.8]):
        '''
        Computes the data needed to draw
        a family of PR curves for the results
        of multiclass classifier output.
        
        Returns a dict of the constituent 
        single-class curve specs, and a
        mean average precision (mAP) score
        for all curves combined.
        
        Each result dict maps a class ID
        to all info needed for one of the
        curves:

          1:
              {'best_op_pt' : best_operating_pt,
               'precisions' : precisions,
               'recalls'    : recalls,
               'thresholds' : thresholds,
               'avg_prec'   : avg_precision
               }
          2:
              {'best_op_pt' : best_operating_pt,
               'precisions' : precisions,
               'recalls'    : recalls,
               'thresholds' : thresholds,
               'avg_prec'   : avg_precision
               }

        where best_op_pt is:

               {'threshold' : <optimal decision probability value>,
                'f1'        : <f1 at the optimal threshold>,
                'prec'      : <precision at the optimal threshold>,
                'rec'       : <recall at the optimal threshold>
                }

        Each of the avg_prec is the 
        the average of precisions across the 
        samples of one class (AP). I.e. there will
        be as many elements in average_precisions
        as there are classes. 
        
        The Mean Average Precision (mAP) is 
        the mean of the average_precision values.
        This measure summarizes the family of PR curves.
        It is comparable to AUC ROC.
        
        The precisions and recalls returns are dicts.
        The keys are class IDs, and the values are the
        precisions for that class. They are the quantities
        from which the average_precision values are 
        computed.
        
        Summary: 
            o precisions/recalls are the lowest granularity
              of information: the per class precs and recs
              at different thresholds.
              
              There are as many entries in these dicts as
              there are classes, and each prec/rec value pair
              in the precisions and recalls dicts is the result
              of one threshold. 

               TODO: o finish this sentence by running and
                       seeing what's what
                     o A unit test for this method
                     o Finally: the actual drawing of the 
                         curves with pyplot
                         
            o average_precision aggregates the precisions
              of one class across multiple thresholds. There 
              will be as many entries in this dict as there 
              are classes.
              
            o mAP aggregates the average_precision values
              across all classes. This is one number.

        :param truth_labels: all truth labels shaped
            torch.Size([num-batches, batch-size])
        :type truth_labels: Tensor
        :param raw_preds: the logits for each class for
            each sample as 
            torch.Shape([num-batches, batch-size, num-classes])
        :type raw_preds: Tensor
        :return: (precisions, recalls, average_precisions, mAP)
        :rtype: ({int : [floats]}, {int : [floats]}, [floats], float)
        '''

        (num_batches, batch_size, num_classes) = raw_preds.shape

        num_samples = num_batches * batch_size

        # Will alternately treat each class
        # prediction as a one-vs-all binary
        # classification.
        #
        # Ex. let labels = [1,0,0,1,2]
        #      and preds = [0.3,0.6,0.1,0.7,0.9]
        #
        # Convert the labels to a one-hot vector;
        # the width of the binarized labels is
        # num_classes:
        #
        #       L A B E L S               P R E D S
        #       ------------              ----------
        #     [1,         [[0, 1, 0],       [0.3,
        #      0,          [1, 0, 0],        0.6,
        #      0,   ==>    [1, 0, 0],        0.1,
        #      1,          [0, 1, 0],        0.7,
        #      2]          [0, 0, 1]]        0.9]
        #
        # Then evaluate each label column vector
        # separately.

        bin_labels = label_binarize(truth_labels.flatten(),
                                    classes=list(range(num_classes)))

        assert (bin_labels.shape == torch.Size([num_samples, num_classes]))
        assert(raw_preds.shape == \
               torch.Size([num_batches, batch_size, num_classes])
               )

        # Want straight down: logits for each class, for
        # each sample ('lst' for 'list'):

        raw_preds_lst = raw_preds.reshape([num_samples, num_classes])

        assert (raw_preds_lst.shape == bin_labels.shape)

        # Turn logits into probs, rowise:
        preds = torch.softmax(raw_preds_lst, dim=1)

        # Place to hold the result dicts
        # from compute_binary_pr_curve()
        # for each of the classes. This
        # will be class-name : binary-result-dict

        all_curves_info = {}

        # Go through each column, class_id i.e. the
        # 1/0-vector label columns and preds
        # columns for one class at
        # a time, and get the prec/rec numbers.

        for col_idx in range(num_classes):
            bin_label_col = torch.tensor(bin_labels[:, col_idx])
            preds_col = preds[:, col_idx]

            # Get all info for this single, binary
            # classification: list of 1/0 labels, and
            # list of floats, which are the preds for
            # the current class:

            #**************
            # # Using sklearn's precision_recall_curve,
            # # which determines thresholds by its own
            # # algorithm:
            #
            # from sklearn.metrics import precision_recall_curve
            # sklearn_precs,\
            # sklearn_recs,\
            # sklearn_thresholds = \
            #     precision_recall_curve(bin_label_col, preds_col)
            #**************

            # Obtain the information needed to
            # draw one PR curve: a CurveSpecification
            # instance:
            one_class_curve = cls.compute_binary_pr_curve(
                bin_label_col,
                preds_col,
                col_idx,  # class_id
                thresholds)

            # Accumulate the curve indices
            # in a dict, keyed by class ID:
            all_curves_info[col_idx] = one_class_curve

        avg_precs = [
            binary_curve_info['avg_prec']
            for binary_curve_info in all_curves_info.values()
        ]
        mAP = np.mean(np.array(avg_precs)).tolist()

        return (all_curves_info, mAP)

    # ----------------- Visualizations ---------------

    #------------------------------------
    # fig_from_conf_matrix
    #-------------------

    @classmethod
    def fig_from_conf_matrix(cls,
                             conf_matrix,
                             supertitle='Confusion Matrix\n',
                             subtitle='',
                             write_in_fields=CELL_LABELING.DIAGONAL):
        '''
        Given a confusion matrix, return a 
        matplotlib.pyplot Figure with a heatmap of the matrix.
        
        The write_in_fields arg controls whether or not
        each cell is filled with a label indicating its
        value. If:
        
            o CELL_LABELING.ALWAYS    : always write the labels
            o CELL_LABELING.NEVER     : never write the labels
            o CELL_LABELING.DIAGONAL  : only label the diagonals
            o CELL_LABELING.AUTO      : only write labels if number of classes
                                        is <= CELL_LABELING.AUTO.value
                
        Result form:
        	             C_1-pred, C_2-pred, C_3-pred
        	 C_1-true        3         1        0
        	 C_2-true        2         6        1
        	 C_3-true        0         0        3
        	         
        
        :param conf_matrix: nxn confusion matrix representing
            rows:truth, cols:predicted for n classes
        :type conf_matrix: pd.DataFrame
        :param supertitle: main title at top of figure
        :type supertitle: str
        :param subtitle: title for the confusion matrix
            only. Ex: "data normalized to percentages"
        :type subtitle: str
        :param write_in_fields: how many cells, if any should 
            contain labels with the cell values. 
        :type write_in_fields: CELL_LABELING
        :return: matplotlib figure with confusion
            matrix heatmap.
        :rtype: pyplot.Figure
        '''

        if type(write_in_fields) != CELL_LABELING:
            raise TypeError(
                f"Arg write_in_fields must be a CELL_LABELING enum member, not {write_in_fields}"
            )

        class_names = conf_matrix.columns
        # Subplot 111: array of subplots has
        # 1 row, 1 col, and the requested axes
        # is in position 1 (1-based):
        # Need figsize=(10, 5) somewhere
        fig, ax = plt.subplots()
        # Make a copy of the cmap, so
        # we can modify it:
        cmap = copy.copy(col_map.Blues)

        fig.set_tight_layout(True)
        fig.suptitle(supertitle, fontsize='large', fontweight='extra bold')

        # Later matplotlib versions want us
        # to use the mticker axis tick locator
        # machinery:
        ax.xaxis.set_major_locator(mticker.MaxNLocator('auto'))
        ticks_loc = ax.get_xticks().tolist()
        ax.xaxis.set_major_locator(mticker.FixedLocator(ticks_loc))
        ax.set_xticklabels([class_name for class_name in ticks_loc],
                           rotation=45)

        ax.yaxis.set_major_locator(mticker.MaxNLocator('auto'))
        ticks_loc = ax.get_yticks().tolist()
        ax.yaxis.set_major_locator(mticker.FixedLocator(ticks_loc))
        ax.set_yticklabels([class_name for class_name in ticks_loc])

        # Add cell labels if requested:
        if write_in_fields == CELL_LABELING.ALWAYS or \
           (write_in_fields == CELL_LABELING.AUTO and len(class_names) <= CELL_LABELING.AUTO.value):
            annot = conf_matrix.copy()
            mask = None
        elif write_in_fields == CELL_LABELING.DIAGONAL:
            # Label every cell, but mask all off-diagonal
            # cells; seaborn only draws annotations for
            # unmasked cells, so only the diagonal is labeled:
            annot = conf_matrix.copy()
            # Fill a new df with True, same dimensions
            # as annot, then clear the diagonal:
            mask = pd.DataFrame(
                np.array([True] * annot.size).reshape(annot.shape))
            np.fill_diagonal(mask.values, False)
        else:
            annot = None
            mask = None

        cmap.set_bad('gray')

        heatmap_ax = sns.heatmap(
            conf_matrix,
            cmap=cmap,
            square=True,
            annot=annot,  # Cell labels
            mask=mask,
            fmt='d',  # Round to integers
            cbar=True,  # Do draw color bar legend
            ax=ax,
            linewidths=1,  # Pixel,
            linecolor='gray',
            robust=True  # Compute colors from quantiles instead of 
            # most extreme values
        )

        # Add '%' after cell numbers; note that fmt='d%'
        # leads to an error; I suspect there is a seaborn
        # heatmap fmt value that would add '%', but I don't
        # have time for frigging format strings:

        for txt in heatmap_ax.texts:
            txt.set_text(txt.get_text() + " %")

        heatmap_ax.set_xticklabels(heatmap_ax.get_xticklabels(), rotation=45)

        heatmap_ax.set_title(
            subtitle,
            fontdict={
                'fontsize': 'medium',
                'fontweight': 'bold',
            },
            pad=12  # Distance above matrix in pt
        )

        # Label x and y to clarify what's predicted,
        # and what's truth; also: have the x-axis label
        # at the top:

        heatmap_ax.set_xlabel('Predicted Classes', fontweight='bold')
        heatmap_ax.xaxis.set_label_position('top')
        heatmap_ax.set_ylabel('True Classes', fontweight='bold')

        # The suptitle was already set on fig above;
        # heatmap_ax belongs to that same figure:
        fig = heatmap_ax.get_figure()

        return fig
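
    # Usage sketch, combining the helpers above; the .csv path
    # is hypothetical:
    #
    #   cm = Charter.read_conf_matrix_from_file('runs/conf_matrix.csv')
    #   fig = Charter.fig_from_conf_matrix(cm,
    #                                      supertitle='Confusion Matrix\n',
    #                                      subtitle='raw counts',
    #                                      write_in_fields=CELL_LABELING.DIAGONAL)
    #   fig.savefig('conf_matrix.png')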

    # -------------------- Utilities for Charter Class --------------

    @classmethod
    def read_conf_matrix_from_file(cls, cm_path):
        '''
        Read a previously computed confusion matrix from
        file. Return a dataframe containing the cm. 
        
        Depending on the original dataframe/tensor/np_array
        from which the .csv was created, the first line
        may have a leading comma. This results in:
        
              Unnamed: 0  foo  bar  fum
            0        foo    1    2    3
            1        bar    4    5    6
            2        fum    7    8    9        
        
        Rather than the correct:
        
                 foo  bar  fum
            foo    1    2    3
            bar    4    5    6
            fum    7    8    9

        Since conf matrices are square, we can check
        and correct for that.
        
        NOTE: if arrays of predicted and truth classes are
               available, rather than an already computed confusion
               matrix saved to file, see compute_confusion_matrix(). 
        
        :param cm_path: path to confusion matrix in csv format
        :type cm_path: str
        :return: confusion matrix as dataframe; no processing on numbers
        :rtype: pd.DataFrame
        '''

        df = pd.read_csv(cm_path)
        # If the leading comma was present, the extra
        # 'Unnamed: 0' column makes the number of column
        # names differ from the number of row indices:
        if len(df.columns) != len(df.index):
            df_good = df.iloc[:, 1:]
            df_good.index = df.columns[1:]
        else:
            df_good = df

        return df_good
Exemplo n.º 16
0
class ModelArchive:
    '''
    Maintains a rotating archive of trained model
    snapshots on disk, keeping at most history_len
    of the most recent model files.
    '''

    #------------------------------------
    # Constructor 
    #-------------------

    def __init__(self, 
                 config, 
                 num_classes,
                 history_len=8,
                 model_root=None,
                 log=None):
        '''
        Constructor:
        
        :param config: configuration structure
        :type config: NeuralNetConfig
        :param num_classes: number of target classes
        :type num_classes: int
        :param history_len: number of model snapshots to 
            maintain
        :type history_len: int
        :param model_root: path to where models
            will be deposited
        :type model_root: str
        :param log: logging service to use. If
            None, create new one for display output
        :type log: LoggingService
        '''

        self.curr_dir = os.path.dirname(os.path.abspath(__file__))
        
        # Model root directory:
        if model_root is None:
            self.model_root = os.path.abspath(
                os.path.join(self.curr_dir, 
                             '../runs_models')
                )
        else:
            self.model_root = model_root

        if os.path.exists(self.model_root) and \
                not os.path.isdir(self.model_root):
            raise FileExistsError(f"{self.model_root} exists but is not a directory")

        # Ensure that intermediate dirs exist:
        try:
            os.makedirs(self.model_root)
        except FileExistsError:
            pass

        if log is None:
            self.log = LoggingService()
        else:
            self.log = log
            
        self.history_len = history_len

        # Create a subdirectory of model_root
        # where this archive keeps its models.
        # The subdir is guaranteed to be unique
        # among model_root's siblings, and it will
        # be created:
        
        self.run_subdir = self._construct_run_subdir(config, 
                                                    num_classes,
                                                    self.model_root)

        # Queue to track models, keeping the 
        # number of saved models to history_len:
        
        self.model_fnames = deque(maxlen=self.history_len)
        
    #------------------------------------
    # save_model 
    #-------------------
    
    def save_model(self, model, epoch):
        '''
        Saves and retains trained models
        on disk. 
        
        Within a subdir the method maintains a queue
        of files of len history_len: 
        
                 fname_1_ep_0.pth
                 fname_2_ep_1.pth
                      ...
                 fname_<history_len>.pth
        
        where ep_<n> is the epoch during training
        where the model of that moment is being 
        saved.
        
        When history_len model files are already present, 
        removes the oldest.
        
        Assumptions: 
            o self.fname_els_dict contains prop/value
              pairs for use in FileUtils.construct_filename()
                 {'bs' : 32,
                  'lr' : 0.001,
                     ...
                 }
            o self model_fnames is a deque the size of
              which indicates how many models to save
              before discarding the oldest one as new
              ones are added
                 
        :param model: model to save
        :type model: nn.module
        :param epoch: the epoch that created the model
        :type epoch: int
        '''
        
        deque_len = len(self.model_fnames)
        if deque_len >= self.history_len:
            # Pushing a new model fname to the
            # front will pop the oldest from the
            # end. That file needs to be deleted:
            oldest_model_path = self.model_fnames[-1]
        else:
            # No file will need to be deleted.
            # Still filling our allotment:
            oldest_model_path = None
            
        model_fname = FileUtils.construct_filename(self.fname_els_dict,
                                                   prefix='mod', 
                                                   suffix=f"_ep{epoch}.pth", 
                                                   incl_date=True)
        
        model_path = os.path.join(self.run_subdir, model_fname)
        
        # As recommended by pytorch, save the
        # state_dict for portability:
        torch.save(model.state_dict(), model_path)

        self.model_fnames.appendleft(model_path)
        
        if oldest_model_path is not None:
            try:
                os.remove(oldest_model_path)
            except Exception as e:
                self.log.warn(f"Could not remove old model: {repr(e)}")


    #------------------------------------
    # restore_model 
    #-------------------
    
    def restore_model(self, model_path, config=None):
        '''
        Given the path to a saved model, 
        load and return it. The saved file
        is the saved model's state_dict. 
        So, the method must first create a
        model instance of the correct type.
        Then the state is loaded into that
        instance.
        
        :param model_path:
        :type model_path:
        :param config: a config structure that will be
            use to decide which model class to instantiate.
            If None, attempts to reconstruct the 
            information from the model_path.
        :type config: NeuralNetConfig
        :return: loaded model
        :rtype: torch.nn.module
        '''
        
        if config is None:
            # Recover the model parameters from the file name:
            model = self._instantiate_model(run_path_str=model_path)
        else:
            model = self._instantiate_model(config=config)
         
        model.load_state_dict(torch.load(model_path))
        return model

    #------------------------------------
    # _instantiate_model 
    #-------------------
    
    def _instantiate_model(self, run_path_str=None, config=None):
        '''
        Returns a model based on information in 
        the config structure, or the info encoded
        in the run_path_str file name. 
        
        One of run_path_str or config must be non-None.
        If both are non-None, uses config.
        
        File paths that encode run parameters look like
        this horror:
        
        model_2021-03-11T10_59_02_net_resnet18_pretrain_0_lr_0.01_opt_SGD_bs_64_ks_7_folds_0_gray_True_classes_10.pth 
        
        :param run_path_str: a path name associated with
            a model. 
        :type run_path_str: str
        :param config: run configuration structure 
        :type config: NeuralNetConfig
        :return: a model 
        :rtype: torch.nn.module
        '''
        if config is None:
            # Get a dict with info 
            # in a standard (horrible) file name:
            fname_props = FileUtils.parse_filename(run_path_str)
        else:
            fname_props = config.Training
            data_root   = config.Paths.root_train_test_data
            class_names = FileUtils.find_class_names(data_root)
            fname_props['classes'] = len(class_names)
            fname_props['pretrain'] = config.Training.getint('freeze', 0)
        
        model = NetUtils.get_net(net_name=fname_props['net_name'],
                                 num_classes=fname_props['classes'],
                                 freeze=fname_props['pretrain'],
                                 to_grayscale=fname_props['to_grayscale']
                                 )
        return model
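
        # Illustration (hedged, not part of the original code): for the
        # example path shown in the docstring, parse_filename() is
        # expected to yield at least the keys consumed above, roughly:
        #
        #    {'net_name': 'resnet18', 'pretrain': 0,
        #     'classes': 10, 'to_grayscale': True, ...}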

# ---------------- Utils -------------

    #------------------------------------
    # _construct_run_subdir 
    #-------------------
    
    def _construct_run_subdir(self, 
                             config, 
                             num_classes, 
                             model_root):
        '''
        Constructs a directory name composed of
        elements specified in utility.py's 
        FileUtils file/config info dicts.
        
        Ensures that <model_root>/subdir_name does
        not exist. If it does, keeps adding '_r<n>'
        to the end of the dir name.
        
        Final str will look like this:
        
        model_2021-03-23T15_38_39_net_resnet18_pre_True_frz_6_bs_2_folds_5_opt_SGD_ks_7_lr_0.01_gray_False
            
        Details will depend on the passed in 
        configuration.

        Instance var fname_els_dict will contain 
        all run attr/values needed for calls to 
        FileUtils.construct_filename() 
        
        :param config: run configuration
        :type config: NeuralNetConfig
        :param num_classes: number of target classes 
        :type num_classes: int
        :param model_root: full path to dir where the
            subdir is to be created
        :type model_root: str
        :return: full path of the newly created, unique
            subdirectory under model_root
        :rtype: str
        '''

        # Using config, gather run-property/value 
        # pairs to include in the dir name:
         
        fname_els_dict = {}
        
        section_dict   = config.Training 
        
        for el_name, el_abbr in FileUtils.fname_long_2_short.items():
            
            el_type = FileUtils.fname_el_types[el_abbr]
            
            if el_type == int:
                fname_els_dict[el_name] = section_dict.getint(el_name)
            elif el_type == str:
                fname_els_dict[el_name] = section_dict.get(el_name)
            elif el_type == float:
                fname_els_dict[el_name] = section_dict.getfloat(el_name)
            elif el_type == bool:
                fname_els_dict[el_name] = section_dict.getboolean(el_name)
            elif callable(el_type):
                # A lambda or func. Apply it:
                fname_els_dict[el_name] = el_type(section_dict[el_name])

        fname_els_dict['num_classes'] = num_classes

        # Save this root name:
        self.fname_els_dict = fname_els_dict

        # Get the subdir name (without leading path):
        dir_basename = FileUtils.construct_filename(
            fname_els_dict,
            prefix='models',
            suffix=None, 
            incl_date=True)
        
        final_dir_path = os.path.join(model_root, dir_basename)
        
        # Disambiguate by appending '_r<n>' as needed: 
        disambiguation = 1
        while os.path.exists(final_dir_path):
            new_basename = f"{dir_basename}_r{disambiguation}"
            final_dir_path = os.path.join(model_root, new_basename)
            disambiguation += 1

        os.makedirs(final_dir_path)
        
        return final_dir_path 
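
# --------------------------------------------------------------
# Illustration (not part of the original class): a minimal,
# self-contained sketch of the rotation scheme that save_model()
# and restore_model() rely on. A deque with maxlen=history_len
# keeps only the newest history_len checkpoint paths; the path
# that falls off the end is deleted from disk. All names here
# (tiny_model, archive_dir, restored) are hypothetical and exist
# only for this sketch.

import os
import tempfile
from collections import deque

import torch
import torch.nn as nn

history_len = 3
model_fnames = deque(maxlen=history_len)
tiny_model = nn.Linear(4, 2)
archive_dir = tempfile.mkdtemp()

for epoch in range(5):
    # If the queue is full, remember which file will be evicted:
    oldest = model_fnames[-1] if len(model_fnames) == history_len else None
    path = os.path.join(archive_dir, f"mod_ep{epoch}.pth")
    # Save the state_dict, as save_model() does:
    torch.save(tiny_model.state_dict(), path)
    model_fnames.appendleft(path)
    if oldest is not None:
        os.remove(oldest)

# Only the newest history_len checkpoints remain on disk:
print(sorted(os.listdir(archive_dir)))  # ['mod_ep2.pth', 'mod_ep3.pth', 'mod_ep4.pth']

# Mirroring restore_model(): instantiate a model of the right
# type, then load the newest state_dict into it:
restored = nn.Linear(4, 2)
restored.load_state_dict(torch.load(model_fnames[0]))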
Example No. 17
class CrossValidatingDataLoader(DataLoader):
    '''
    
    Subclass of torch.utils.data.DataLoader. Provides
    stratified k-fold crossvalidation in single-machine,
    (optionally) single-GPU context.
    
    Instantiate this class if running only on a
    single machine, optionally using a single GPU. Else,
    instantiate the MultiprocessingDataLoader subclass 
    instead.
    
    An instance of this class wraps any dict-API dataset instance 
    that provides tuples, for instance (<img-tensor>, class-label-int), 
    from the file system when given a sample ID.
    
    This subclass of torch.utils.data.DataLoader specializes
    the default by using a stratified k-fold cross validation
    sampler. That underlying sampler manages partitioning of
    samples into folds, and successively feeds samples from
    the training folds. The sampler also manages the 'switching out'
    of folds to take the role of test fold in round robin fashion.
        
    This DataLoader instance also manages the combination of 
    samples into batches.
    
    An instance of this class presents an iterator API, additionally
    serving the test samples whenever one set of train folds is 
    exhausted. Example: assume 
          
          o k-fold cross validation k = 5
        
          for split in range(k):
          
              for batch in my_dataloader:
                  try:
                      <feed training batch to emerging model>
                  except EndOfSplit as e:
                      print(e.message) # Just for debugging
                      break
                  
              # Exhausted all train folds of one split
              # Now test current state of the 
              # model using this split's test samples,
              # which are available as an iterator from the
              # dataloader:
              
              for (img_tensor, label) in my_dataloader.validation_samples():
                  <test model on img_tensor>
         
              # next split
              
    The validation_samples() method is a generator that provides the content of 
    the just exhausted split's validation samples.
    
    NOTE: when re-setting an instance of this class
          for a new epoch, client must call set_epoch()
          with the new epoch number to ensure proper
          shuffling randomness. Such a reset occurs implicitly
          with the often used idiom:
               
                for i, res in enumerate(dataloader):
        
          The enumerate() starts the same dataloader instance
          from the beginning. 
          
          If shuffle is False, set_epoch() need not be called.
          But doing so does no harm.
          
    '''

    #------------------------------------
    # Constructor
    #-------------------

    def __init__(self,
                 dataset,
                 batch_size=32,
                 shuffle=True,
                 seed=42,
                 num_workers=0,
                 pin_memory=False,
                 prefetch_factor=2,
                 drop_last=True,
                 num_folds=10,
                 sampler=None,
                 logger=None):
        '''
        This instance will use cross validation
        as it serves out samples. The client determines
        the number of folds to use. Example for 
        num_folds of 2:
        
         Split1:
           TrainFold1    TrainFold2   ValidationFold  
            sample1      sample2        sample3
            sample4      sample5        sample6

         Split2:
           TrainFold1    TrainFold2   ValidationFold  
            sample3      sample4        sample2
            sample1      sample6        sample5
            
        This dataloader will create two sequences,
        like this:
        
           For use with training:   [sample1, sample4, sample2, sample5]
           For use with validation: [sample3, sample6]
             after the training 
             sequence is used up

        Assuming batch_size of two, this dataloader's
        client will receive one row from each 
        call to next():
        
            [[sample1, sample4],
             [sample2, sample5]
             ]
             
        Once this split's training batches are exhausted,
        the dataloader raises an EndOfSplit exception to
        signal that it is time to validate.
        
        The client then calls validation_samples() on
        this dataloader instance to receive one validation
        sample at a time. The client will predict the
        (target) class for each of these validation samples,
        and tally successes and failures. The client should
        then compute the validation accuracy from
        that series of successes and failures. 

        Calling next() again will create a new split,
        and again feed out the samples in the respective
        new folds.
        
        The feed terminates after as many splits as there
        are folds. Any following call to next() will raise
        a StopIteration exception.

        :param dataset: underlying map-store that 
                supplies (img_tensor, label) tuples
        :type dataset: BirdDataset
        :param batch_size: number of samples to combine into 
            a batch to feed model during training
        :type batch_size: int
        :param shuffle: whether or not to shuffle the
            dataset once, initially.
        :type shuffle: bool
        :param seed: random seed to use if shuffle is True
        :type seed: int
        :param num_workers: number of threads used to preload
        :type num_workers: int
        :param pin_memory: set to True if using a GPU. Speeds
            transfer of tensors from CPU to GPU
        :type pin_memory: bool
        :param prefetch_factor: how many samples to prefetch from
            underlying database to speed access to file system
        :type prefetch_factor: int
        :param drop_last: whether or not to drop the final,
            partially filled batch that occurs when samples
            cannot be evenly packed into batches. 
        :type drop_last: bool
        :param num_folds: the 'k' in k-fold cross validation
        :type num_folds: int
        :param sampler: Only used when MultiprocessingDataLoader
            is being instantiated, and that class's __init__()
            calls super(). Leave out for single-process/single-GPU
            use
        :type sampler: {None | DistributedSKFSampler}
        :param logger: the LoggingService instance to use
            for logging info/warnings/errors. If None, fetches
            the LoggingService singleton.
        :type logger: LoggingService
        '''

        if len(dataset) == 0:
            raise ValueError("Dataset is empty, nothing to load")

        self.drop_last = drop_last

        if logger is None:
            self.log = LoggingService()
        else:
            self.log = logger

        # Sampler will only be set if a subclass instance
        # of MultiprocessingDataLoader is being initialized.
        # Else, running single process:

        if sampler is None:
            self.sampler = SKFSampler(dataset,
                                      num_folds=num_folds,
                                      shuffle=shuffle,
                                      drop_last=drop_last,
                                      seed=seed)
        else:
            self.sampler = sampler

        if not isinstance(batch_size, int) or batch_size <= 0:
            msg = f"Batch size must be a positive int, not "

            # Complete the error msg according which of
            # the two failure conditions occurred:
            msg += type(batch_size).__name__ if not isinstance(batch_size, int)\
                                           else f"{batch_size}"

            raise ValueError(msg)

        self.batch_size = batch_size

        self.num_folds = num_folds

        # Total num of batches served when
        # rotating through all folds is computed
        # the first time __len__() is called:

        self.num_batches = None
        self.curr_split_idx = -1

        super().__init__(dataset,
                         batch_size=batch_size,
                         sampler=self.sampler,
                         num_workers=num_workers,
                         pin_memory=pin_memory,
                         prefetch_factor=prefetch_factor,
                         drop_last=drop_last)

    #------------------------------------
    # __len__
    #-------------------

    def __len__(self):
        '''
        Number of batches this loader will
        feed out. Example:
            o 12 samples total
            o  3 folds
            o  2 batch size
            o  4 samples in each split (12/3)
            o  2 batches per split (samples-each-split / batch-size)
            o  3 number of trips through folds
            o  2 number of folds in each of the 3
                 trips (num-folds - hold-out-fold)
            o 12 batches total: batches-per-fold * folds-per-trip * num-folds 
                   2*2*3 = 12
        '''

        # Compute number of batches only once:
        if self.num_batches is None:

            # This computation can surely be more
            # concise and direct. But it happens only
            # once, and this step by step is easier
            # on the eyes than one minimal expression:
            num_samples = len(self.sampler)

            if num_samples == 0:
                raise ValueError("No samples to serve.")

            # Rounded-down number of samples that fit into each fold.
            # Having 34 samples with 3 folds, that is 34/3 == ~11

            samples_per_fold = num_samples // self.num_folds

            # For training we get 2 folds worth of samples,
            # with one fold held out: 11*2 = 22

            samples_per_split = samples_per_fold * (self.num_folds - 1)

            # As many permutations as there are folds: 3 * 22: 66

            total_train_samples = self.num_folds * samples_per_split

            # Convert to batches. Assume batch_size of 2:
            # 66 // 2 = 33

            self.num_batches = total_train_samples // self.batch_size
            if self.num_batches == 0:
                self.log.warn(
                    f"Not enough data ({total_train_samples}) for even one batch (of size {self.batch_size})"
                )

            remainder_samples = total_train_samples % self.batch_size
            if not self.drop_last and remainder_samples > 0:
                # Add the final, partially filled batch that
                # remains when the number of samples is not
                # a multiple of batch_size:
                self.num_batches += 1

        return self.num_batches

    #------------------------------------
    # __iter__
    #-------------------
    def __iter__(self):
        # Call to __next__() returns
        # a generator, which does the
        # right thing with next(), list(),
        # and for loops. Return that iterator:

        return (self.__next__())

    #------------------------------------
    # __next__
    #-------------------

    def __next__(self):

        # Loop over all splits (i.e. over all
        # configurations of which fold is for
        # validation).

        # Get one list of sample IDs that
        # covers all train samples in one split.
        # And one list of sample IDs that
        # are to be used for validation in this
        # split.

        # Raise EndOfSplit exception at the end of
        # each split, i.e. when client is to validate.
        # When all splits are exhausted, raise StopIteration.

        for split_train_idxs, split_test_idxs in self.sampler:

            # Keep track of which split we are working
            # on. Needed only as info for client; not
            # used for logic in this method:

            self.curr_split_idx += 1

            # split_train_idxs has all sample IDs
            # to use for training in this split.
            # The split_test_idxs holds the left-out
            # sample IDs to use for testing once
            # the split_train_idxs have been served out
            # one batch at a time.

            # Set this split's test sample ids aside for client
            # to retrieve later via: get_split_test_sample_ids()
            # once they pulled all the batches of this
            # split:
            self.curr_test_sample_ids = []
            for sample_idx in split_test_idxs:
                self.curr_test_sample_ids.append(
                    self.dataset.sample_id_by_sample_idx(sample_idx))

            # Create one batch:

            num_train_sample_ids = len(split_train_idxs)
            num_batches = num_train_sample_ids // self.batch_size
            num_remainder_samples = num_train_sample_ids % self.batch_size
            batch_start_idx = 0

            # Create num_batches batches from the
            # training data of this split:

            for _batch_count in range(num_batches):

                batch = None
                # Truth labels for each sample in
                # the current batch:
                y = []
                batch_end_idx = batch_start_idx + self.batch_size
                curr_batch_range = range(batch_start_idx, batch_end_idx)

                for train_sample_idx in curr_batch_range:

                    # Index into the current split's list
                    # of training sample ids:
                    sample_idx = split_train_idxs[train_sample_idx]
                    # Get one pair: <img-tensor>, class_id_int:
                    (img_tensor,
                     label) = self.dataset.sample_by_idx(sample_idx)
                    expanded_img_tensor = unsqueeze(img_tensor, dim=0)
                    batch = (cat((batch, expanded_img_tensor), dim=0)
                             if batch is not None else expanded_img_tensor)
                    y.append(label)

                # Got one batch ready:
                yield (batch, torch.tensor(y))

                # Client consumed one batch in current split.
                # Next batch: Starts another batch size
                # samples onwards in the train split:

                batch_start_idx += self.batch_size

                # Put together next batch:
                continue

            # Done all full batches. Any partial batch
            # left over that we should include?

            if num_remainder_samples > 0 and not self.drop_last:
                batch = None
                y = []

                for train_sample_idx in range(batch_start_idx, batch_start_idx +
                                              num_remainder_samples):
                    # Like the full batches above, map the position in
                    # this split's train list to a dataset sample index:
                    sample_idx = split_train_idxs[train_sample_idx]
                    (img_tensor, label) = self.dataset.sample_by_idx(sample_idx)
                    expanded_img_tensor = unsqueeze(img_tensor, dim=0)
                    batch = (cat((batch, expanded_img_tensor))
                             if batch is not None else expanded_img_tensor)
                    y.append(label)
                yield (batch, torch.tensor(y))

            # Let client know that all batches for one split
            # have been delivered by raising EndOfSplit:

            raise EndOfSplit()

            # Next split:
            continue

    #------------------------------------
    # get_curr_fold_idx
    #-------------------

    def get_curr_fold_idx(self):
        return self.curr_split_idx

    #------------------------------------
    # get_split_test_sample_ids
    #-------------------

    def get_split_test_sample_ids(self):
        try:
            return self.curr_test_sample_ids
        except AttributeError:
            # No split has been started yet:
            return None

    #------------------------------------
    # validation_samples
    #-------------------

    def validation_samples(self):
        '''
        Generator that runs through every
        test sample_id of the current fold, 
        and feeds (<img_tensor>, label) pairs.
        
           for (img_tensor, label) in my_bird_dataloader.validation_samples():
               <test model>
        '''

        for sample_id in self.get_split_test_sample_ids():
            yield self.dataset[sample_id]

    #------------------------------------
    # file_from_sample_id
    #-------------------

    def file_from_sample_id(self, sample_id):
        '''
        Given a sample_id, return the absolute
        file path of the corresponding sample
        in the file system.
        
        We use the public dataset method.
        
        :param sample_id: sample ID to look up
        :type sample_id: int
        '''
        return self.dataset.file_from_sample_id(sample_id)

    #------------------------------------
    # class_from_sample_id
    #-------------------

    def class_from_sample_id(self, sample_id):
        '''
        Given a sample ID, return its class index.
        
        :param sample_id: ID to look up
        :type sample_id: int
        :return: given sample's class ID
        :rtype: int
        '''
        return self.dataset.sample_id_to_class[sample_id]

    #------------------------------------
    # set_epoch
    #-------------------

    def set_epoch(self, new_epoch):
        '''
        Must be called by client every time
        a new epoch starts. The epoch number
        is used by the sampler to shuffle
        the dataset before beginning to draw
        samples.

        :param new_epoch: the epoch under which the dataloader
            is (re)started
        :type new_epoch: int
        '''
        self.sampler.set_epoch(new_epoch)
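
# --------------------------------------------------------------
# Illustration (not part of the original class): a minimal sketch
# of the client-side loop that the class docstring above describes.
# The train_step/evaluate callables are hypothetical placeholders,
# and the EndOfSplit exception class is passed in as an argument
# rather than imported, since its module path is project-specific.

def run_cross_validation(dataloader, train_step, evaluate, end_of_split_exc):
    '''
    Drive one pass over all splits of a CrossValidatingDataLoader.
    
    :param dataloader: a CrossValidatingDataLoader instance
    :param train_step: callable(batch, targets) performing one
        optimization step on a training batch
    :param evaluate: callable(img_tensor, label) recording one
        validation prediction
    :param end_of_split_exc: the EndOfSplit exception class that
        the dataloader raises when a split's training folds are
        exhausted
    '''
    for _split in range(dataloader.num_folds):
        # Serve training batches until this split is exhausted:
        try:
            for batch, targets in dataloader:
                train_step(batch, targets)
        except end_of_split_exc:
            pass
        # Validate on the fold held out in this split:
        for img_tensor, label in dataloader.validation_samples():
            evaluate(img_tensor, label)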
Example No. 18
class BirdsBasicTrainerCV:
    '''
    Basic trainer that runs stratified k-fold cross
    validation, using a CrossValidatingDataLoader.
    '''
    # Number of intermediate models to save
    # during training:

    MODEL_ARCHIVE_SIZE = 20

    # For some tensorboard displays:
    # for how many epochs in the past
    # to display data:

    DISPLAY_HISTORY_LEN = 10

    #------------------------------------
    # Constructor
    #-------------------

    def __init__(self,
                 config_info,
                 device=0,
                 percentage=None,
                 debugging=False):
        '''
        
        :param config_info: all path and training parameters
        :type config_info: NeuralNetConfig
        :param debugging: output lots of debug info
        :type debugging: bool
        :param device: number of GPU to use; default is dev 0
            if any GPU is available
        :type device: {None | int}
        :param percentage: percentage of training data to 
            use
        :type percentage: {int | float}
        '''

        self.log = LoggingService()
        if debugging:
            self.log.logging_level = DEBUG

        if percentage is not None:
            # Integrity check:
            if type(percentage) not in [int, float]:
                raise TypeError(
                    f"Percentage must be int or float, not {type(percentage)}")
            if percentage < 1 or percentage > 100:
                raise ValueError(
                    f"Percentage must be between 1 and 100, not {percentage}")

        if device is None:
            device = 0

        available_gpus = torch.cuda.device_count()
        if available_gpus == 0:
            self.log.info("No GPU available; running on CPU")
        else:
            if device > available_gpus - 1:
                raise ValueError(
                    f"Asked to operate on device {device}, but only {available_gpus} are available"
                )
            torch.cuda.set_device(device)

        self.curr_dir = os.path.dirname(os.path.abspath(__file__))

        try:
            self.config = self.initialize_config_struct(config_info)
        except Exception as e:
            msg = f"During config init: {repr(e)}"
            self.log.err(msg)
            raise RuntimeError(msg) from e

        try:
            self.root_train_test_data = self.config.getpath(
                'Paths', 'root_train_test_data', relative_to=self.curr_dir)
        except ValueError as e:
            raise ValueError(
                "Config file must contain an entry 'root_train_test_data' in section 'Paths'"
            ) from e

        self.batch_size = self.config.getint('Training', 'batch_size')
        self.kernel_size = self.config.getint('Training', 'kernel_size')
        self.min_epochs = self.config.Training.getint('min_epochs')
        self.max_epochs = self.config.Training.getint('max_epochs')
        self.lr = self.config.Training.getfloat('lr')
        self.net_name = self.config.Training.net_name
        self.pretrained = self.config.Training.getboolean('pretrained', False)
        self.num_folds = self.config.Training.getint('num_folds')
        self.freeze = self.config.Training.getint('freeze', 0)
        self.to_grayscale = self.config.Training.getboolean(
            'to_grayscale', True)

        self.set_seed(42)

        self.log.info("Parameter summary:")
        self.log.info(f"network     {self.net_name}")
        self.log.info(f"pretrained  {self.pretrained}")
        if self.pretrained:
            self.log.info(f"freeze      {self.freeze}")
        self.log.info(f"min epochs  {self.min_epochs}")
        self.log.info(f"max epochs  {self.max_epochs}")
        self.log.info(f"batch_size  {self.batch_size}")

        self.fastest_device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        self.device = self.fastest_device
        self.num_classes = self.find_num_classes(self.root_train_test_data)

        self.initialize_model()

        sample_width = self.config.getint('Training', 'sample_width', 400)
        sample_height = self.config.getint('Training', 'sample_height', 400)

        self.train_loader = self.get_dataloader(sample_width,
                                                sample_height,
                                                perc_data_to_use=percentage)
        self.log.info(f"Expecting {len(self.train_loader)} batches per epoch")
        num_train_samples = len(self.train_loader.dataset)
        num_classes = len(self.train_loader.dataset.class_names())
        self.log.info(
            f"Training set contains {num_train_samples} samples across {num_classes} classes"
        )

        self.class_names = self.train_loader.dataset.class_names()

        log_dir = os.path.join(self.curr_dir, 'runs')
        raw_data_dir = os.path.join(self.curr_dir, 'runs_raw_results')

        self.setup_tensorboard(log_dir, raw_data_dir=raw_data_dir)

        # Log a few example spectrograms to tensorboard;
        # one per class:
        TensorBoardPlotter.write_img_grid(
            self.writer,
            self.root_train_test_data,
            len(self.class_names),  # Number of example images to show (one per class)
        )

        # All ResultTally instances are
        # collected here: (num_folds * num-epochs)
        # each for training and validation steps.

        self.step_results = ResultCollection()

        self.log.debug(
            f"Just before train: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
        )
        try:
            final_step = self.train()
            self.visualize_final_epoch_results(final_step)
        finally:
            self.close_tensorboard()

    #------------------------------------
    # train
    #-------------------

    def train(self):

        overall_start_time = datetime.datetime.now()
        # Just for sanity: keep track
        # of number of batches...
        total_batch_num = 0

        # Note: since we are cross validating, the
        # data loader's set_epoch() method is only
        # called once (automatically) during instantiation
        # of the associated sampler. Moving from split
        # to split includes shuffling if the caller
        # specified that.

        # Training
        for split_num in range(self.train_loader.num_folds):

            split_start_time = datetime.datetime.now()
            self.initialize_model()
            for epoch in range(self.max_epochs):

                # Set model to train mode:
                self.model.train()

                epoch_start_time = datetime.datetime.now()

                self.log.info(f"Starting epoch {epoch} training")

                # Sanity check record: will record
                # how many samples from each class were
                # used:
                self.class_coverage = {}

                # Sanity records: will record number
                # of samples of each class that are used
                # during training and validation:
                label_distrib = {}
                batch_num = 0

                self.log.info(
                    f"Train epoch {epoch}/{self.max_epochs} split {split_num}/{self.train_loader.num_folds}"
                )
                try:
                    for batch, targets in self.train_loader:
                        # Update the sanity check
                        # num of batches seen, and distribution
                        # of samples across classes:
                        batch_num += 1
                        total_batch_num += 1

                        # Update sanity check records:
                        for lbl in targets:
                            lbl = int(lbl)
                            try:
                                label_distrib[lbl] += 1
                            except KeyError:
                                label_distrib[lbl] = 1
                            try:
                                self.class_coverage[lbl]['train'] += 1
                            except KeyError:
                                self.class_coverage[lbl] = {
                                    'train': 1,
                                    'val': 0
                                }

                        self.log.debug(
                            f"Top of training loop: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
                        )

                        images = FileUtils.to_device(batch, 'gpu')
                        labels = FileUtils.to_device(targets, 'gpu')

                        outputs = self.model(images)
                        loss = self.loss_fn(outputs, labels)
                        self.optimizer.zero_grad()
                        loss.backward()
                        self.optimizer.step()

                        # Remember the last batch's train result of this
                        # split (results for earlier batches of
                        # the same split will be overwritten). This statement
                        # must sit before deleting output and labels:

                        step_num = self.step_number(epoch, split_num,
                                                    self.num_folds)
                        self.remember_results(LearningPhase.TRAINING, step_num,
                                              outputs, labels, loss)

                        self.log.debug(
                            f"Just before clearing gpu: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
                        )

                        images = FileUtils.to_device(images, 'cpu')
                        outputs = FileUtils.to_device(outputs, 'cpu')
                        labels = FileUtils.to_device(labels, 'cpu')
                        loss = FileUtils.to_device(loss, 'cpu')

                        del images
                        del outputs
                        del labels
                        del loss
                        torch.cuda.empty_cache()

                        self.log.debug(
                            f"Just after clearing gpu: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
                        )
                except EndOfSplit:

                    end_time = datetime.datetime.now()
                    train_time_duration = end_time - epoch_start_time
                    # A human-readable duration string, down to minutes:
                    duration_str = FileUtils.time_delta_str(
                        train_time_duration, granularity=4)

                    self.log.info(
                        f"Done training epoch {epoch} of split {split_num} (duration: {duration_str})"
                    )

                    #***********
                    #print(f"****** num_batches in split: {batch_num}" )
                    #print(f"****** LblDist: {label_distrib}")
                    #***********
                    self.validate_split(step_num)
                    self.visualize_step(step_num)
                    # Save model, keeping self.model_archive_size models:
                    self.model_archive.save_model(self.model, epoch)

                    self.log.debug(
                        f"After eval: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
                    )

                    # Next Epoch
                    continue

            end_time = datetime.datetime.now()
            train_time_duration = end_time - split_start_time
            # A human-readable duration string, down to minutes:
            duration_str = FileUtils.time_delta_str(train_time_duration,
                                                    granularity=4)

            self.log.info(
                f"Done training split {split_num} (duration: {duration_str})")

            # Next split
            continue

        end_time = datetime.datetime.now()
        epoch_duration = end_time - epoch_start_time
        epoch_dur_str = FileUtils.time_delta_str(epoch_duration, granularity=4)

        cumulative_dur = end_time - overall_start_time
        cum_dur_str = FileUtils.time_delta_str(cumulative_dur, granularity=4)

        msg = f"Done epoch {epoch}  (epoch duration: {epoch_dur_str}; cumulative: {cum_dur_str})"
        self.log.info(msg)

        #******self.scheduler.step()

        # Fresh results tallying
        #self.results.clear()

        self.log.info(
            f"Training complete after {self.train_loader.num_folds} splits")

        # Report the sanity checks:
        self.log.info(f"Total batches processed: {total_batch_num}")
        for cid in self.class_coverage.keys():
            train_use = self.class_coverage[cid]['train']
            val_use = self.class_coverage[cid]['val']
            self.log.info(
                f"{self.class_names[cid]} Training: {train_use}, Validation: {val_use}"
            )

        # All seems to have gone well. Report the
        # overall result of the final epoch for the
        # hparms config used in this process:

        self.report_hparams_summary(self.latest_result)

        # The final epoch number:
        return epoch

    #------------------------------------
    # validate_split
    #-------------------

    def validate_split(self, step):
        '''
        Validate one split, using that split's 
        validation fold. Return time taken. Record
        results for tensorboard and other record keeping.
        
        :param step: current combination of epoch and 
            split
        :type step: int
        :return: time taken for the validation
        :rtype: datetime.timedelta
        '''
        # Validation

        self.log.debug(
            f"Start of validation: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
        )

        start_time = datetime.datetime.now()
        self.log.info(f"Starting validation for step {step}")

        self.model.eval()
        with torch.no_grad():
            for img_tensor, target in self.train_loader.validation_samples():
                expanded_img_tensor = unsqueeze(img_tensor, dim=0)
                expanded_target = unsqueeze(target, dim=0)

                # Update sanity record:
                self.class_coverage[int(target)]['val'] += 1

                images = FileUtils.to_device(expanded_img_tensor, 'gpu')
                label = FileUtils.to_device(expanded_target, 'gpu')

                outputs = self.model(images)
                loss = self.loss_fn(outputs, label)

                images = FileUtils.to_device(images, 'cpu')
                outputs = FileUtils.to_device(outputs, 'cpu')
                label = FileUtils.to_device(label, 'cpu')
                loss = FileUtils.to_device(loss, 'cpu')

                self.remember_results(LearningPhase.VALIDATING, step, outputs,
                                      label, loss)
                del images
                del outputs
                del label
                del loss
                torch.cuda.empty_cache()

        end_time = datetime.datetime.now()
        val_time_duration = end_time - start_time
        # A human-readable duration string, down to minutes:
        duration_str = FileUtils.time_delta_str(val_time_duration,
                                                granularity=4)
        self.log.info(f"Done validation (duration: {duration_str})")

        return val_time_duration

    # ------------- Utils -----------

    #------------------------------------
    # report_acc_loss
    #-------------------

    def report_acc_loss(self, phase, epoch, accumulated_loss):

        self.writer.add_scalar(f"loss/{phase}", accumulated_loss, epoch)

    #------------------------------------
    # remember_results
    #-------------------

    def remember_results(
        self,
        phase,
        step,
        outputs,
        labels,
        loss,
    ):

        # Add the results
        tally = ResultTally(step, phase, outputs, labels, loss,
                            self.num_classes, self.batch_size)
        # Add result to intermediate results collection of
        # tallies:
        self.results[step] = tally

        # Same with the session-wide
        # collection:

        self.step_results.add(tally)

    #------------------------------------
    # visualize_step
    #-------------------

    def visualize_step(self, step):
        '''
        Take the ResultTally instances
        in the train and val ResultCollections
        in self.results, and report appropriate
        aggregates to tensorboard. Computes
        f1 scores, accuracies, etc. for given
        step.

        Separately for train and validation
        results: build one long array 
        of predictions, and a corresponding
        array of labels. Also, average the
        loss across all instances.
        
        The preds and labels are also written as 
        rows to csv files.

        '''

        val_tally = self.results[(step, str(LearningPhase.VALIDATING))]
        train_tally = self.results[(step, str(LearningPhase.TRAINING))]

        result_coll = ResultCollection()
        result_coll.add(val_tally, step)
        result_coll.add(train_tally, step)

        self.latest_result = {'train': train_tally, 'val': val_tally}

        # If we are to write preds and labels to
        # .csv for later additional processing:

        if self.csv_writer is not None:
            self.csv_writer.writerow([
                step, train_tally.preds, train_tally.labels, val_tally.preds,
                val_tally.labels
            ])

        TensorBoardPlotter.visualize_step(
            result_coll, self.writer,
            [LearningPhase.TRAINING, LearningPhase.VALIDATING], step,
            self.class_names)
        # History of learning rate adjustments:
        lr_this_step = self.optimizer.param_groups[0]['lr']
        self.writer.add_scalar('learning_rate', lr_this_step, global_step=step)

    #------------------------------------
    # visualize_final_epoch_results
    #-------------------

    def visualize_final_epoch_results(self, epoch):
        '''
        Reports to tensorboard just for the
        final epoch.
 
        Expect self.latest_result to be the latest
        ResultTally.
        '''
        # DISPLAY_HISTORY_LEN holds the number
        # of historic epochs we will show. Two
        # results per epochs --> need
        # 2*DISPLAY_HISTORY_LEN results. But check
        # that there are that many, and show fewer
        # if needed:

        num_res_to_show = min(len(self.step_results),
                              2 * self.DISPLAY_HISTORY_LEN)

        f1_hist = self.step_results[-num_res_to_show:]

        # First: the table of train and val f1-macro
        # scores for the past few epochs:
        #
        #      |phase|ep0  |ep1 |ep2 |
        #      |-----|-----|----|----|
        #      |train| f1_0|f1_1|f1_2|
        #      |  val| f1_0|f1_1|f1_2|

        f1_macro_tbl = TensorBoardPlotter.make_f1_train_val_table(f1_hist)
        self.writer.add_text('f1/history', f1_macro_tbl)

        # Now, in the same tensorboard row: the
        # per_class train/val f1 scores for each
        # class separately:
        #
        # |class|weighted mean f1 train|weighted mean f1 val|
        # |-----|----------------------|--------------------|
        # |  c1 |0.1                   |0.6                 |
        # |  c2 |0.1                   |0.6                 |
        # |  c3 |0.1                   |0.6                 |
        # ------|----------------------|--------------------|

        f1_all_classes = TensorBoardPlotter.make_all_classes_f1_table(
            self.latest_result, self.class_names)
        self.writer.add_text('f1/per-class', f1_all_classes)

    #------------------------------------
    # report_hparams_summary
    #-------------------

    def report_hparams_summary(self, latest_result):
        '''
        Called at the end of training. Constructs
        a summary to report for the hyperparameters
        used in this process. Reports to the tensorboard.
         
        Hyperparameters reported:
         
           o lr
           o optimizer
           o batch_size
           o kernel_size
         
        Included in the measures are:
         
           o balanced_accuracy      (train and val)
           o mean_accuracy          (train and val)
           o epoch_prec_weighted    (val)
           o epoch_recall_weighted  (val)
           o epoch_mean_loss        (train and val)
           
         
        :param latest_result: dict with keys 'train' and
            'val', holding the respective most recent
            (i.e. last-epoch) ResultTally
        :type latest_result: {'train' : ResultTally,
                               'val'   : ResultTally
                               }
        '''

        # Get the latest validation tally:
        train_tally = latest_result['train']
        val_tally = latest_result['val']

        hparms_vals = OrderedDict({
            'net':
            self.net_name,
            'pretrained':
            f"{self.pretrained}",
            'lr_initial':
            self.config.Training.lr,
            'optimizer':
            self.config.Training.opt_name,
            'batch_size':
            self.config.getint('Training', 'batch_size'),
            'kernel_size':
            self.config.getint('Training', 'kernel_size'),
            'to_grayscale':
            self.to_grayscale
        })

        metric_results = {
            'zz_balanced_adj_acc_train': train_tally.balanced_acc,
            'zz_balanced_adj_acc_val': val_tally.balanced_acc,
            'zz_acc_train': train_tally.accuracy,
            'zz_acc_val': val_tally.accuracy,
            'zz_epoch_weighted_prec': val_tally.prec_weighted,
            'zz_epoch_weighted_recall': val_tally.recall_weighted,
            'zz_epoch_mean_loss_train': train_tally.mean_loss,
            'zz_epoch_mean_loss_val': val_tally.mean_loss
        }

        self.writer.add_hparams(hparms_vals, metric_results)

    #------------------------------------
    # get_dataloader
    #-------------------

    def get_dataloader(self,
                       sample_width,
                       sample_height,
                       perc_data_to_use=None):
        '''
        Returns a cross validating dataloader. 
        If perc_data_to_use is None, all samples
        under self.root_train_test_data will be
        used for training. Else percentage indicates
        the percentage of those samples to use. The
        selection is random.
        
        :param sample_width: pixel width of returned images
        :type sample_width: int
        :param sample_height: pixel height of returned images
        :type sample_height: int
        :param perc_data_to_use: amount of available training
            data to use.
        :type perc_data_to_use: {None | int | float}
        :return: a data loader that serves batches of
            images and their associated labels
        :rtype: CrossValidatingDataLoader
        '''

        data_root = self.root_train_test_data

        train_dataset = SingleRootImageDataset(data_root,
                                               sample_width=sample_width,
                                               sample_height=sample_height,
                                               percentage=perc_data_to_use,
                                               to_grayscale=True)

        sampler = SKFSampler(train_dataset,
                             num_folds=self.num_folds,
                             seed=42,
                             shuffle=True,
                             drop_last=True)

        train_loader = CrossValidatingDataLoader(train_dataset,
                                                 batch_size=self.batch_size,
                                                 shuffle=True,
                                                 drop_last=True,
                                                 sampler=sampler,
                                                 num_folds=self.num_folds)
        return train_loader

    #------------------------------------
    # initialize_model
    #-------------------

    def initialize_model(self):
        self.model = NetUtils.get_net(self.net_name,
                                      num_classes=self.num_classes,
                                      pretrained=self.pretrained,
                                      freeze=self.freeze,
                                      to_grayscale=self.to_grayscale)
        self.log.debug(
            f"Before any gpu push: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
        )

        FileUtils.to_device(self.model, 'gpu')

        self.log.debug(
            f"Just after model push: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
        )

        self.opt_name = self.config.Training.get('optimizer',
                                                 'Adam')  # Default
        self.optimizer = self.get_optimizer(self.opt_name, self.model, self.lr)

        self.loss_fn = nn.CrossEntropyLoss()
        self.scheduler = optim.lr_scheduler.CosineAnnealingLR(
            self.optimizer, self.min_epochs)

    #------------------------------------
    # find_num_classes
    #-------------------

    def find_num_classes(self, data_root):
        '''
        Expect two subdirectories under data_root:
        train and validation. Underneath each are 
        further subdirectories whose names are the
        classes:
        
                train               validation
        class1 class2 class3     class1 class2 class3
          imgs   imgs   imgs       imgs   imgs   imgs
        
        No error checking to confirm this structure
        
        :param data_root: path to parent of train/validation
        :type data_root: str
        :return: number of unique classes as obtained
            from the directory names
        :rtype: int
        '''
        self.classes = FileUtils.find_class_names(data_root)
        return len(self.classes)

    #------------------------------------
    # setup_tensorboard
    #-------------------

    def setup_tensorboard(self, logdir, raw_data_dir=True):
        '''
        Initialize tensorboard. To easily compare experiments,
        use runs/exp1, runs/exp2, etc.
        
        Method creates the dir if needed.
        
        Additionally, sets self.csv_writer to None, or to an
        open CSV writer, depending on the value of raw_data_dir;
        see create_csv_writer().
        
        :param logdir: root for tensorboard events
        :type logdir: str
        :param raw_data_dir: destination for the raw preds/labels
            csv files; see create_csv_writer()
        :type raw_data_dir: {None | True | str}
        '''

        if not os.path.isdir(logdir):
            os.makedirs(logdir)

        # For storing train/val preds/labels
        # for every epoch. Used to create charts
        # after run is finished:
        self.csv_writer = self.create_csv_writer(raw_data_dir)

        # Place to store intermediate models:
        self.model_archive = \
            self.create_model_archive(self.config,
                                      self.num_classes
                                      )

        # Use SummaryWriterPlus to avoid confusing
        # directory creations when calling add_hparams()
        # on the writer:

        self.writer = SummaryWriterPlus(log_dir=logdir)

        # Intermediate storage for train and val results:
        self.results = ResultCollection()

        self.log.info(
            f"To view tensorboard charts: in shell: tensorboard --logdir {logdir}; then browser: localhost:6006"
        )

    #------------------------------------
    # create_csv_writer
    #-------------------

    def create_csv_writer(self, raw_data_dir):
        '''
        Create a csv_writer that will fill a csv
        file during training/validation as follows:
        
            epoch  train_preds   train_labels  val_preds  val_labels
            
        Cols after the integer 'epoch' col will each be
        an array of ints:
        
                  train_preds    train_lbls   val_preds  val_lbls
                2,"[2,5,1,2,3]","[2,6,1,2,1]","[1,2]",    "[1,3]" 
        
        If raw_data_dir is provided as a str, it is
        taken as the directory where csv file with predictions
        and labels are to be written. The dir is created if necessary.
         
        If the arg is instead set to True, a dir 'runs_raw_results' is
        created under this script's directory if it does not
        exist. Then a subdirectory is created for this run,
        using the hparam settings to build a file name. The dir
        is created if needed. Result ex.:
        
              <script_dir>
                   runs_raw_results
                       Run_lr_0.001_br_32
                           run_2021_05_ ... _lr_0.001_br_32.csv
        
        
        Then file name is created, again from the run
        hparam settings. If this file exists, user is asked whether
        to remove or append. The inst var self.csv_writer is
        initialized to:
        
           o None if csv file exists, but is not to 
             be overwritten nor appended-to
           o A file descriptor for a file open for either
             'write' or 'append'.
        
        :param raw_data_dir: If simply True, create dir and file names
            from hparams, and create as needed. If a string, it is 
            assumed to be the directory where a .csv file is to be
            created. If None, self.csv_writer is set to None.
        :type raw_data_dir: {None | True | str}
        :return: CSV writer ready for action. Set either to
            write a fresh file, or append to an existing file.
            Unless file exists, and user decided not to overwrite
        :rtype: {None | csv.writer}
        '''

        # Ensure the csv file root dir exists if
        # we'll do a csv dir and run-file below it:

        if type(raw_data_dir) == str:
            raw_data_root = raw_data_dir
        else:
            raw_data_root = os.path.join(self.curr_dir, 'runs_raw_results')

        if not os.path.exists(raw_data_root):
            os.mkdir(raw_data_root)

        # Can rely on raw_data_root being defined and existing:

        if raw_data_dir is None:
            return None

        # Create both a raw-data sub-directory and a .csv file
        # for this run:
        csv_subdir_name = FileUtils.construct_filename(self.config.Training,
                                                       prefix='Run',
                                                       incl_date=True)
        csv_subdir = os.path.join(raw_data_root, csv_subdir_name)
        os.makedirs(csv_subdir, exist_ok=True)

        # Create a csv file name:
        csv_file_nm = FileUtils.construct_filename(self.config.Training,
                                                   prefix='run',
                                                   suffix='.csv',
                                                   incl_date=True)

        csv_path = os.path.join(csv_subdir, csv_file_nm)

        # Open the csv file in the appropriate mode:

        if os.path.exists(csv_path):
            do_overwrite = FileUtils.user_confirm(
                f"File {csv_path} exists; overwrite?", default='N')
            if do_overwrite:
                mode = 'w'
            else:
                do_append = FileUtils.user_confirm("Append instead?",
                                                   default='N')
                if not do_append:
                    return None
                mode = 'a'
        else:
            mode = 'w'

        csv_writer = CSVWriterCloseable(csv_path, mode=mode, delimiter=',')

        header = [
            'epoch', 'train_preds', 'train_labels', 'val_preds', 'val_labels'
        ]
        csv_writer.writerow(header)

        return csv_writer

    #------------------------------------
    # create_model_archive
    #-------------------

    def create_model_archive(self, config, num_classes):
        '''
        Creates facility for saving partially trained
        models along the way.
        
        :param config:
        :type config:
        :param num_classes:
        :type num_classes:
        :return: ModelArchive instance ready
            for calls to save_model()
        :rtype: ModelArchive
        '''
        model_archive = ModelArchive(config,
                                     num_classes,
                                     history_len=self.MODEL_ARCHIVE_SIZE,
                                     log=self.log)
        return model_archive

    #------------------------------------
    # close_tensorboard
    #-------------------

    def close_tensorboard(self):
        if self.csv_writer is not None:
            try:
                self.csv_writer.close()
            except Exception as e:
                self.log.warn(f"Could not close csv file: {repr(e)}")
        try:
            self.writer.close()
        except AttributeError:
            self.log.warn(
                "Method close_tensorboard() called before setup_tensorboard()?"
            )
        except Exception as e:
            raise RuntimeError(
                f"Problem closing tensorboard: {repr(e)}") from e

    #------------------------------------
    # get_optimizer
    #-------------------

    def get_optimizer(self, optimizer_name, model, lr):

        optimizer_name = optimizer_name.lower()
        if optimizer_name == 'adam':
            optimizer = optim.Adam(model.parameters(),
                                   lr=lr,
                                   eps=1e-3,
                                   amsgrad=True)
            return optimizer

        if optimizer_name == 'sgd':
            optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
            return optimizer

        if optimizer_name == 'rmsprop':
            optimizer = optim.RMSprop(model.parameters(), lr=lr, momentum=0.9)
            return optimizer

        raise ValueError(f"Optimizer {optimizer_name} not supported")

    #------------------------------------
    # initialize_config_struct
    #-------------------

    def initialize_config_struct(self, config_info):
        '''
        Initialize a config dict of dict with
        the application's configurations. Sections
        will be:
        
          config['Paths']       -> dict[attr : val]
          config['Training']    -> dict[attr : val]
          config['Parallelism'] -> dict[attr : val]
        
        The config read method will handle config_info
        being None. 
        
        If config_info is a string, it is assumed either 
        to be a file containing the configuration, or
        a JSON string that defines the config.
         
        Else config_info is assumed to be a NeuralNetConfig.
        The latter is relevant only if using this file
        as a library, rather than a command line tool.
        
        If given a NeuralNetConfig instance, it is returned
        unchanged. 
        
        :param config_info: the information needed to construct
            the structure
        :type config_info: {NeuralNetConfig | str}
        :return: a NeuralNetConfig instance with all parms
            initialized
        :rtype: NeuralNetConfig
        '''

        if isinstance(config_info, str):
            # Is it a JSON str? Should have a better test!
            if config_info.startswith('{'):
                # JSON String:
                config = NeuralNetConfig.from_json(config_info)
            else:
                config = self.read_configuration(config_info)
        elif isinstance(config_info, NeuralNetConfig):
            config = config_info
        else:
            msg = f"Error: must have a config file, not {config_info}. See config.cfg.Example in project root"
            # Since logdir may be in config, need to use print here:
            print(msg)
            raise ConfigError(msg)

        return config

    #------------------------------------
    # read_configuration
    #-------------------

    def read_configuration(self, conf_file):
        '''
        Parses config file that describes training parameters,
        various file paths, and how many GPUs different machines have.
        Syntax follows Python's configparser module, which includes
        sections, and attr/val pairs in each section.
        
        Expected sections:

           o Paths: various file paths for the application
           o Training: holds batch sizes, number of epochs, etc.
           o Parallelism: holds number of GPUs on different machines
        
        For Parallelism, expect entries like:
        
           foo.bar.com  = 4
           127.0.0.1    = 5
           localhost    = 3
           172.12.145.1 = 6
           
        The method identifies which of the entries refers to
        this machine by comparing them against the local hostname;
        'localhost' or '127.0.0.1' may also be provided.
        
        Returns a dict of dicts: 
            config[section-names][attr-names-within-section]
            
        Types of standard entries, such as epochs, batch_size,
        etc. are coerced, so that, e.g. config['Training']['epochs']
        will be an int. Clients may add non-standard entries.
        For those the client must convert values from string
        (the type in which values are stored by default) to the
        required type. This can be done the usual way: int(...),
        or using one of the configparser's retrieval methods
        getboolean(), getint(), and getfloat():
        
            config['Training'].getfloat('learning_rate')
        
        :param conf_file: path to configuration file
        :type conf_file: str
        :return: a dict of dicts mirroring the config file sections/entries
        :rtype: dict[dict]
        :raises ValueError
        :raises TypeError
        '''

        if conf_file is None:
            return self.init_defaults()

        config = DottableConfigParser(conf_file)

        if len(config.sections()) == 0:
            # Config file exists, but empty:
            return (self.init_defaults(config))

        # Do type conversion also in other entries that
        # are standard:

        types = {
            'epochs': int,
            'batch_size': int,
            'kernel_size': int,
            'sample_width': int,
            'sample_height': int,
            'seed': int,
            'pytorch_comm_port': int,
            'num_pretrained_layers': int,
            'root_train_test_data': str,
            'net_name': str,
        }
        for section in config.sections():
            for attr_name in config[section].keys():
                try:
                    str_val = config[section][attr_name]
                    required_type = types[attr_name]
                    config[section][attr_name] = required_type(str_val)
                except KeyError:
                    # Current attribute is not standard;
                    # users of the corresponding value need
                    # to do their own type conversion when
                    # accessing this configuration entry:
                    continue
                except (TypeError, ValueError):
                    raise ValueError(
                        f"Config file error: {section}.{attr_name} should be convertible to {required_type}"
                    )

        return config

    #------------------------------------
    # set_seed
    #-------------------

    def set_seed(self, seed):
        '''
        Set the seed across all different necessary platforms
        to allow for comparison of different models and runs
        
        :param seed: random seed to set for all random num generators
        :type seed: int
        '''
        torch.manual_seed(seed)
        cuda.manual_seed_all(seed)
        # Force cuDNN to use deterministic algorithms, and disable
        # its benchmark autotuner (which selects algorithms
        # non-deterministically):
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        np.random.seed(seed)
        os.environ['PYTHONHASHSEED'] = str(seed)
        random.seed(seed)

    #------------------------------------
    # time_delta_str
    #-------------------

    def time_delta_str(self, epoch_delta, granularity=2):
        '''
        Takes the difference between two datetime times:
        
               start_time = datetime.datetime.now()
               <some time elapses>
               end_time = datetime.datetime.now()
               
               delta = end_time - start_time
               time_delta_str(delta)
        
        Depending on granularity, returns a string like:
        
            Granularity:
                      1  '160.0 weeks'
                      2  '160.0 weeks, 4.0 days'
                      3  '160.0 weeks, 4.0 days, 6.0 hours'
                      4  '160.0 weeks, 4.0 days, 6.0 hours, 42.0 minutes'
                      5  '160.0 weeks, 4.0 days, 6.0 hours, 42.0 minutes, 13.0 seconds'
        
            For smaller time deltas, such as 10 seconds,
            does not include leading zero times. For
            any granularity:
            
                          '10.0 seconds'

            If the duration is less than a second, returns '< 1sec>'
            
        :param epoch_delta: time difference to render
        :type epoch_delta: datetime.timedelta
        :param granularity: number of time units to include in the result
        :type granularity: int
        '''
        intervals = (
            ('weeks', 604800),  # 60 * 60 * 24 * 7
            ('days', 86400),  # 60 * 60 * 24
            ('hours', 3600),  # 60 * 60
            ('minutes', 60),
            ('seconds', 1),
        )
        secs = epoch_delta.total_seconds()
        result = []
        for name, count in intervals:
            value = secs // count
            if value:
                secs -= value * count
                if value == 1:
                    name = name.rstrip('s')
                result.append("{} {}".format(value, name))
        dur_str = ', '.join(result[:granularity])
        if len(dur_str) == 0:
            dur_str = '< 1sec>'
        return dur_str
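    # Example: time_delta_str(datetime.timedelta(seconds=3723), granularity=3)
    #          returns '1.0 hour, 2.0 minutes, 3.0 seconds'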

    #------------------------------------
    # step_number
    #-------------------

    def step_number(self, epoch, split_num, num_folds):
        '''
        Combines an epoch with a split number into 
        a single integer series as epochs increase
        and split_num cycles from 0 to num_folds - 1.
        
        :param epoch: epoch to encode
        :type epoch: int
        :param split_num: split number to encode
        :type split_num: int
        :param num_folds: number of folds for CV splitting;
            must be constant!
        :type num_folds: int
        :return: an integer that combines epoch and split-num
        :rtype: int
        '''
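        # Example: epoch=2, split_num=3, num_folds=10  =>  step 2*10 + 3 = 23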

        step_num = epoch * num_folds + split_num
        return step_num

    #------------------------------------
    # cleanup
    #-------------------

    def cleanup(self):
        '''
        Recover resources taken by collaborating
        processes. OK to call multiple times.
        '''
        # self.clear_gpu()

        try:
            self.writer.close()
        except Exception as e:
            self.log.err(f"Could not close tensorboard writer: {repr(e)}")
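
# A minimal, self-contained sketch of the type-coercion pattern that
# read_configuration() above applies, using only the standard-library
# configparser (DottableConfigParser is the project's own wrapper and is
# not shown here; the section and attribute values below are illustrative):

import configparser

cfg_text = """
[Training]
epochs = 15
batch_size = 32
learning_rate = 0.001
"""

config = configparser.ConfigParser()
config.read_string(cfg_text)

types = {'epochs': int, 'batch_size': int}
coerced = {}
for attr_name, str_val in config['Training'].items():
    # Standard entries are coerced; anything else stays a string:
    coerced[attr_name] = types.get(attr_name, str)(str_val)

print(coerced)   # {'epochs': 15, 'batch_size': 32, 'learning_rate': '0.001'}
print(config['Training'].getfloat('learning_rate'))   # 0.001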
Exemplo n.º 19
0
    def __init__(self,
                 dataset,
                 batch_size=32,
                 shuffle=True,
                 seed=42,
                 num_workers=0,
                 pin_memory=False,
                 prefetch_factor=2,
                 drop_last=True,
                 num_folds=10,
                 sampler=None,
                 logger=None):
        '''
        This instance will use cross validation
        as it serves out samples. The client determines
        the number of folds to use. Example for 
        num_folds of 3:
        
         Split1:
           TrainFold1    TrainFold2   ValidationFold  
            sample1      sample2        sample3
            sample4      sample5        sample6

         Split2:
           TrainFold1    TrainFold2   ValidationFold  
            sample3      sample4        sample2
            sample1      sample6        sample5
            
        This dataloader will create two sequences,
        like this:
        
           For use with training:   [sample1, sample4, sample2, sample5]
           For use with validation: [sample3, sample6]
             after the training 
             sequence is used up

        Assuming batch_size of two, this dataloader's
        client will receive one row from each 
        call to next():
        
            [[sample1, sample4],
             [sample2, sample5],
             [None   , None]
             ]
             
        The None tuple indicates that this split has
        been exhausted, and it is time to validate.
        
        The client then calls validation_samples() on
        this dataloader instance to receive one validation
        sample at a time. The client will predict the
        (target) class for each of these validation samples,
        and tally successes and failures. The client should
        then compute the validation accuracy from
        that series of successes and failures. 

        Calling next() again will create a new split,
        and again feed out the samples in the respective
        new folds.
        
        The feed terminates after as many splits as there
        are folds. Any following call to next() will raise
        a StopIteration exception.

        :param dataset: underlying map-store that 
                supplies(img_torch, label) tuples
        :type dataset: BirdDataset
        :param batch_size: number of samples to combine into 
            a batch to feed model during training
        :type batch_size: int
        :param shuffle: whether or not to shuffle the
            dataset once, initially.
        :type shuffle: bool
        :param seed: random seed to use if shuffle is True
        :type seed: int
        :param num_workers: number of threads used to preload
        :type num_workers: int
        :param pin_memory: set to True if using a GPU. Speeds
            transfer of tensors from CPU to GPU
        :type pin_memory: bool
        :param prefetch_factor: how many samples to prefetch from
            underlying database to speed access to file system
        :type prefetch_factor: int
        :param drop_last: whether or not to drop the final, partially 
            filled batch. Such batches occur when samples cannot be
            evenly packed into batches. 
        :type drop_last: bool
        :param num_folds: the 'k' in k-fold cross validation
        :type num_folds: int
        :param sampler: Only used when MultiprocessingDataLoader
            is being instantiated, and that class's __init__()
            calls super(). Leave out for single-process/single-GPU
            use
        :type sampler: {None | DistributedSKFSampler}
        :param logger: the LoggingService instance to use
            for logging info/warnings/errors. If None, fetches
            the LoggingService singleton.
        :type logger: LoggingService
        '''

        if len(dataset) == 0:
            raise ValueError("Dataset is empty, nothing to load")

        self.drop_last = drop_last

        if logger is None:
            self.log = LoggingService()
        else:
            self.log = logger

        # Sampler will only be set if a subclass instance
        # of MultiprocessingDataLoader is being initialized.
        # Else, running single process:

        if sampler is None:
            self.sampler = SKFSampler(dataset,
                                      num_folds=num_folds,
                                      shuffle=shuffle,
                                      drop_last=drop_last,
                                      seed=seed)
        else:
            self.sampler = sampler

        if not isinstance(batch_size, int) or batch_size <= 0:
            msg = f"Batch size must be a positive int, not "

            # Complete the error msg according which of
            # the two failure conditions occurred:
            msg += type(batch_size).__name__ if not isinstance(batch_size, int)\
                                           else f"{batch_size}"

            raise ValueError(msg)

        self.batch_size = batch_size

        self.num_folds = num_folds

        # Total num of batches served when
        # rotating through all folds is computed
        # the first time __len__() is called:

        self.num_batches = None
        self.curr_split_idx = -1

        super().__init__(dataset,
                         batch_size=batch_size,
                         sampler=self.sampler,
                         num_workers=num_workers,
                         pin_memory=pin_memory,
                         prefetch_factor=prefetch_factor,
                         drop_last=drop_last)
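
# A minimal sketch of the fold rotation described in the docstring above,
# using sklearn's StratifiedKFold directly (the arrays and names here are
# illustrative stand-ins, not part of the loader's API):

import numpy as np
from sklearn.model_selection import StratifiedKFold

samples = np.arange(12)              # stand-ins for (img, label) pairs
labels = np.array([0, 1] * 6)        # two balanced classes
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

for split_num, (train_idx, val_idx) in enumerate(skf.split(samples, labels)):
    # One "split": train on k-1 folds, then validate on the held-out fold:
    print(f"Split {split_num}: train={train_idx.tolist()}, val={val_idx.tolist()}")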
Exemplo n.º 20
0
class TensorBoardPlotter:
    '''
    Support functionality for creating custom 
    graphs and images for submission to Tensorboard.
    Services include:
    
        o Create confusion matrix images
        o Bar charts for number of samples in each class
        o Placing a grid of images on Tensorboard
        o Writing (i.e. overlaying) text onto images
        
    No SummaryWriter is created. A writer is always
    passed in
    '''
    DISPLAY_HISTORY_LEN = 8

    log = LoggingService()

    #------------------------------------
    # collection_to_tensorboard
    #-------------------

    @classmethod
    def collection_to_tensorboard(cls, tally_coll, writer, phases, step,
                                  class_names):
        '''
        Reports standard results from all tallies
        in the given collection to a tensorboard.
        Included are:
        
            o Various charts
            o Result text tables
            o hparams
        
        :param tally_coll: collection of ResultTally instances,
            keyed by (step, phase)
        :type tally_coll: {(int, LearningPhase) : ResultTally}
        :param writer: tensorboard writer
        :type writer: tensorboard.SummaryWriter
        :param phases: learning phases to report on
        :type phases: [LearningPhase]
        :param step: step number under which to log the results
        :type step: int
        :param class_names: names of the target classes, needed
            for the confusion matrix
        :type class_names: [str]
        '''
        cls.visualize_step(tally_coll, writer, phases, step, class_names)

    #------------------------------------
    # visualize_step
    #-------------------

    @classmethod
    def visualize_step(cls, tally_coll, writer, phases, step, class_names):
        '''
        Take the ResultTally instances from
        the same step from the tally_coll, 
        and report appropriate aggregates to 
        tensorboard. 
        
        Computes f1 scores, accuracies, etc. for 
        given step.

        Separately for train and validation
        results: build one long array 
        of predictions, and a corresponding
        array of labels. Also, average the
        loss across all instances.
        
        Then the preds and labels are written as rows to csv 
        files.
        
        :return: a ResultTally instance with all
            metrics computed for display
        :rtype: ResultTally
        '''

        try:
            tallies = {
                str(phase): tally_coll[(step, phase)]
                for phase in phases
            }
        except KeyError as e:
            cls.log.err(f"Step: {step}, phases: {phases}: {repr(e)}")
            return

        for phase in phases:

            # Need learning phase in string forms
            # below:
            phase_str = str(phase)

            tally = tallies[phase_str]
            writer.add_scalar(f"loss/{phase_str}",
                              tally.mean_loss,
                              global_step=step)

            writer.add_scalar(f"balanced_accuracy_score/{phase_str}",
                              tally.balanced_acc,
                              global_step=step)

            writer.add_scalar(f"accuracy_score/{phase_str}",
                              tally.accuracy,
                              global_step=step)

            # The following are only sent to the
            # tensorboard for validation and test
            # phases.

            if phase in (LearningPhase.VALIDATING, LearningPhase.TESTING):

                # Submit the confusion matrix image
                # to the tensorboard. In the following:
                # do not provide a separate title, such as
                #  title=f"Confusion Matrix (Validation): Step{step}"
                # That would put each matrix into its own slot
                # on tensorboard, rather than having a time slider

                TensorBoardPlotter.conf_matrix_to_tensorboard(
                    writer,
                    tally.conf_matrix,
                    class_names,
                    step=step,
                    title=f"Confusion Matrix Series")

                # Versions of the f1 score:

                writer.add_scalar(f"{phase_str}_f1/macro",
                                  tally.f1_macro,
                                  global_step=step)
                writer.add_scalar(f"{phase_str}_f1/micro",
                                  tally.f1_micro,
                                  global_step=step)
                writer.add_scalar(f"{phase_str}_f1/weighted",
                                  tally.f1_weighted,
                                  global_step=step)

                # Versions of precision/recall:

                writer.add_scalar(f"{phase_str}_prec/macro",
                                  tally.prec_macro,
                                  global_step=step)
                writer.add_scalar(f"{phase_str}_prec/micro",
                                  tally.prec_micro,
                                  global_step=step)
                writer.add_scalar(f"{phase_str}_prec/weighted",
                                  tally.prec_weighted,
                                  global_step=step)

                writer.add_scalar(f"{phase_str}_recall/macro",
                                  tally.recall_macro,
                                  global_step=step)
                writer.add_scalar(f"{phase_str}_recall/micro",
                                  tally.recall_micro,
                                  global_step=step)
                writer.add_scalar(f"{phase_str}_recall/weighted",
                                  tally.recall_weighted,
                                  global_step=step)

        return tally

    #------------------------------------
    # conf_matrix_to_tensorboard
    #-------------------

    @classmethod
    def conf_matrix_to_tensorboard(cls,
                                   writer,
                                   conf_matrix,
                                   class_names=None,
                                   step=0,
                                   title='Confusion Matrix'):
        '''
        Add confusion matrix to tensorboard as an
        image. Multiple conf matrices (from multiple steps)
        may be overlaid. Tensorboard will add a slider to 
        run through them.
        
        :param writer: tensorboard writer
        :type writer: tensorboard.SummaryWriter 
        :param conf_matrix: confusion matrix to draw as heatmap
        :type conf_matrix: pd.DataFrame
        :param class_names: names of the target classes; accepted
            so that callers such as visualize_step() can pass them
            along with the matrix
        :type class_names: [str]
        :param step: the step number that generated the matrix
        :type step: int
        :param title: title to add to the image
        :type title: str
        '''
        conf_matrix_fig = Charter.fig_from_conf_matrix(conf_matrix,
                                                       supertitle=title)
        writer.add_figure(title, conf_matrix_fig, global_step=step)

    #------------------------------------
    # class_support_to_tensorboard
    #-------------------

    @classmethod
    def class_support_to_tensorboard(cls,
                                     data_src,
                                     writer,
                                     step=0,
                                     title='Class Support'):
        '''
        Create a barchart showing number of training samples
        in each class. The chart is converted to
        a tensor, and submitted to tensorboard.
        
        The data_src may be:
        
           o a dataset in the pytorch sense, or 
           o a full path to the root of a training data directory
           
        A barchart with the number of samples in each class
        is created from that data source.

        :param data_src: either a path to samples,
            or a dataset
        :type data_src: {str | SingleRootImageDataset}
        :param writer: a tensorboard summary writer
        :type writer: tensorboard.SummaryWriter
        :param step: step for which support is shown
        :type step: int
        :param title: optional title above the figure
        :type title: str
        :return: dict {<class_name> : <num_samples_for_class_name>}
            i.e. number of samples in each class. 
        :rtype: {str : int}
        '''

        if type(data_src) == str:
            # Data source is file path to
            # root of training data. Create
            # a dataset from that tree:
            dataset = SingleRootImageDataset(data_src)
        elif type(data_src) != SingleRootImageDataset:
            raise ValueError(
                f"Data source must be path to data root, or a dataset, not {data_src}"
            )
        else:
            dataset = data_src

        # Get dict: {<class_id> : <class_name>}
        class_id_to_name = {
            class_id: class_name
            for class_name, class_id in dataset.class_to_id.items()
        }

        # Goal is corresponding np arrays:
        #    class-name, num-samples-in-class.

        # First, get corresponding tuples of
        # class *ids* and sample counts. The
        # 'zip(*<list-of-tuples>) notation is
        # the inverse of a zip():
        # take [(c1,n1), (c2,n2),...] that is returned
        # from sample_distribution(), and create two
        # arrays: [c1,c2,...], and [n1,n2,...]

        [class_id_tuple,
         sample_count_tuple] = zip(*dataset.sample_distribution())

        # Create np array of class *names* from the class ID tuple:
        class_names = np.array(
            [class_id_to_name[class_id] for class_id in class_id_tuple])
        sample_counts = np.array(sample_count_tuple)

        # Make a horizontal chart, so class names are
        # Y-axis labels:
        y_pos = np.arange(len(class_names))

        fig, ax = plt.subplots()
        fig.suptitle('Number of Samples in Each Class')
        _bar_container = ax.barh(
            y_pos,
            sample_counts,  # Bar length (i.e. width) 
            tick_label=class_names,
            align='center')
        ax.set_xlabel('Number of Samples')

        # Convert matplotlib figure into
        # an image tensor for tensorboard:

        writer.add_figure(title, fig, step)

        support_dict = {
            class_name: num_samples
            for class_name, num_samples in zip(class_names, sample_counts)
        }
        return support_dict

    #------------------------------------
    # add_image
    #-------------------

    @classmethod
    def add_image(
            cls,
            writer,
            tag,
            img_path,
            step=0,
            to_grayscale=True,
            img_height=200,  # px
            img_width=400  # px
    ):
        '''
        Writes a single image to tensorboard.
        Can resize image or turn to grayscale
        if requested. If img_width or img_height
        is None, no scaling is done.
        
        :param writer: the SummaryWriter to use
        :type writer: SummaryWriter
        :param tag: the name of the image in 
            tensorboard display
        :type tag: str
        :param img_path: full path to image
        :type img_path: str
        :param step: step
        :type step: int
        :param to_grayscale: whether or not to convert
            to grayscale
        :type to_grayscale: bool
        :param img_height: desired image height
        :type img_height: int
        :param img_width: desired image width
        :type img_width: int
        '''

        the_transforms = []
        if img_height is not None and img_width is not None:
            the_transforms.append(transforms.Resize((img_height, img_width)))
        if to_grayscale:
            the_transforms.append(transforms.Grayscale())
        the_transforms.append(transforms.ToTensor())

        transform_img = transforms.Compose(the_transforms)
        img = Image.open(img_path)
        img = transform_img(img).float()

        # A 10px frame around each img:
        #grid = make_grid(img, padding=10)
        #writer.add_image(tag, grid, step)

        writer.add_image(tag, img, step)

    #------------------------------------
    # write_img_grid
    #-------------------

    @classmethod
    def write_img_grid(
            cls,
            writer,
            img_root_dir,
            num_imgs=4,
            class_sample_file_pairs=None,
            img_height=200,  # px
            img_width=400,  # px
            to_grayscale=True,
            unittesting=False):
        '''
        Create and log a Tensorboard 'grid' of
        example train images. The img_root_dir must
        be the 'data root': the dir holding one subdir
        per class.

        :param writer: a Tensorboard Pytorch SummaryWriter
        :type writer: SummaryWriter
        :param img_root_dir: directory 
            that contains sub-directories with samples. The 
            sub-directory names are taken to be class names.  
        :type img_root_dir: str
        :param num_imgs: total number of images to
            include in the grid. If None: all images
        :type num_imgs: {None | int}
        :param class_sample_file_pairs: <class>/<img_file_name> for
            individual images if random choice is not wanted.
        :type class_sample_file_pairs: {None | str | [str]}
        :param img_height: height of all images
        :type img_height: int (pixels)
        :param img_width: width of all images
        :type img_width: int (pixels)
        :param to_grayscale: whether or not to convert 
            images to grayscale upon import
        :type to_grayscale: bool
        :param unittesting: controls whether grid is
            actually created, or the img tensor that
            would be contained in the grid is returned
            for testing dimensions.
        :type unittesting: bool 
        '''

        if img_root_dir is None:
            raise ValueError("Must provide path to image root dir")

        # Prepare to resize all images to a given dimension,
        # convert to grayscale if requested, and turn into
        # a tensor:
        the_transforms = [transforms.Resize((img_height, img_width))]
        if to_grayscale:
            the_transforms.append(transforms.Grayscale())
        the_transforms.append(transforms.ToTensor())

        transform_img = transforms.Compose(the_transforms)

        # Get an ImageFolder instance, from which
        # we will easily find classes and samples

        img_folder = ImageFolder(img_root_dir,
                                 transform=transform_img,
                                 loader=default_loader)

        # Get list of full paths to samples:
        sample_idxs = cls._get_sample_indices(
            img_folder,
            num_imgs=num_imgs,
            class_sample_file_pairs=class_sample_file_pairs)
        # Get list of img tensor/class_idx pairs:
        img_tns_list = [img_folder[idx] for idx in sample_idxs]

        # Print <class>/file_name onto
        # each spectrogram:

        marked_img_tns_list = []
        for i, (img_tns, class_idx) in enumerate(img_tns_list):
            class_name = img_folder.classes[class_idx]
            # img_folder.samples is [ (full_path, class_idx), (..., ...) ];
            # index via sample_idxs[i] so the file name matches
            # the sample actually chosen:
            img_file_basename = os.path.basename(
                img_folder.samples[sample_idxs[i]][0])
            marked_img_tns_list.append(
                cls.print_onto_image(img_tns,
                                     f"{class_name}/{img_file_basename}"))
        # Turn list of img tensors into
        # a single tensor with first dim
        # being len of list:
        marked_img_tns = torch.cat(marked_img_tns_list)

        # A 10px frame around each img:
        grid = make_grid(marked_img_tns, padding=10)

        if unittesting:
            return grid
        writer.add_image('Train Input Examples', grid)
        return grid

    #------------------------------------
    # make_f1_train_val_table
    #-------------------

    @classmethod
    def make_f1_train_val_table(cls, res_list):
        '''
        Return a github flavored table:
           |phase|ep0  |ep1 |ep2 |
           |-----|-----|----|----|
           |train| f1_0|f1_1|f1_2|
           |  val| f1_0|f1_1|f1_2|
           
        for half as many steps back as there are
        tallies available in the list of ResultTally
        instances in step_results.
        
        Assumption: exactly two ResultTallies are provided
        in res_list. One each for train and validation 
        results.
           
        :param res_list: list of ResultTally
            instances in oldest-step-first order
        :type res_list: [ResultTally]
        :return: a table
        :rtype: str
        '''
        res_len = len(res_list)

        # Could catch the following error.
        # But it's just a special case of
        # num train tallies unequal to num
        # of val tallies. Wait till we catch
        # that root problem later:

        # Should be an even number of result
        # objs:
        #if res_len % 2 != 0:
        #    raise ValueError("Must provide two ResultTally instances per step")

        num_steps = res_len // 2

        # First the header:
        header = []
        for i in range(num_steps):
            header.append(f"f1-macro ep{i}")

        # The f1 value results for both
        # train and val:
        train_f1s = filter(
            lambda res_tally: res_tally.phase == LearningPhase.TRAINING,
            res_list)
        val_f1s = filter(
            lambda res_tally: res_tally.phase == LearningPhase.VALIDATING,
            res_list)

        train_row = []
        for res in train_f1s:
            train_row.append(str(round(res.f1_macro, 1)))

        val_row = []
        # Second row: f1's for validation results:
        for res in val_f1s:
            val_row.append(str(round(res.f1_macro, 1)))

        if len(val_row) != len(train_row):
            raise ValueError(
                f"Must have equal num of train/val tallies; have {len(val_row)} vals and {len(train_row)} trains"
            )

        tbl_content = {
            'col_header': header,
            'row_labels': ['training', 'validation'],
            'rows': [train_row, val_row]
        }
        tbl = GithubTableMaker.make_table(tbl_content)

        return tbl

    #------------------------------------
    # make_all_classes_f1_table
    #-------------------

    @classmethod
    def make_all_classes_f1_table(cls, latest_result, class_names):
        '''
        Return a github flavored table with
        with train and val f1 values for every
        class:
        
            |class|weighted mean f1 train|weighted mean f1 val|
            |-----|----------------------|--------------------|
            |  c1 |     0.1              |   0.6              |
            |  c2 |     0.1              |   0.6              |
            |  c3 |     0.1              |   0.6              |
            ---------------------------------------------------
        
        '''

        # Get the 'all-classes' version of f1 from
        # the last ResultTally for both train and val:
        t_f1s = latest_result['train'].f1_all_classes
        v_f1s = latest_result['val'].f1_all_classes

        if t_f1s is None or \
           v_f1s is None or \
           len(t_f1s) == 0 or\
           len(v_f1s) == 0:
            raise ValueError(
                "Both, train and val values of f1_all_classes must be non-empty lists"
            )

        # Get [[c1_train, c1_val],
        #      [c2_train, c2_val],
        #             ...
        #      ]
        res = torch.tensor([t_f1s, v_f1s]).T

        header = ['weighted mean f1 train', 'weighted mean f1 val']

        # And the f1 train/val numbers, one
        # class in each row:
        row_labels = []
        rows = []
        for class_name, (f1_train, f1_val) in zip(class_names, res):
            f1_train = round(float(f1_train), 1)
            f1_val = round(float(f1_val), 1)
            row_labels.append(class_name)
            rows.append([f1_train, f1_val])

        tbl_content = {
            'col_header': header,
            'row_labels': row_labels,
            'rows': rows
        }
        tbl = GithubTableMaker.make_table(tbl_content)
        return tbl

    #------------------------------------
    # print_onto_image
    #-------------------

    @classmethod
    def print_onto_image(cls, img_src, txt, point=(10, 10)):
        '''
        Given an image, writes given text onto the image.
        Returns a tensor of the new image. Acceptable as image
        sources are:
        
            o File path to jpg, png, etc.
            o A tensor
            o A PIL image
            
        :param img_src: image, or a way to get the image
        :type img_src: {str | Tensor | PIL}
        :param txt: text to be printed onto the image
        :type txt: str
        :param point: where to place the text. In pixels,
            origin upper left
        :type point: [int,int]
        :return: new image with text 'burned' onto it
        :rtype: Tensor
        '''

        if type(img_src) == str:
            # Image is a path:
            try:
                pil_img = Image.open(img_src)
            except Exception as e:
                raise ValueError(
                    f"Could not load img from {img_src}: {repr(e)}")

        elif type(img_src) == torch.Tensor:
            try:
                pil_img = transforms.ToPILImage()(img_src.squeeze_(0))
            except Exception as e:
                raise ValueError(
                    f"Could not convert tensor to PIL img ({img_src.size()}): {repr(e)}")

        elif not Image.isImageType(img_src):
            raise ValueError(
                f"Image src must be path to img, tensor, or PIL image; not {type(img_src)}"
            )

        else:
            pil_img = img_src

        # Make a blank image for the text.
        # Match the mode (RGB/RGBA/L/...):

        txt_img = Image.new(pil_img.mode, pil_img.size, 255)

        # get a font
        fnt = ImageFont.load_default()
        # get a drawing context
        drawing = ImageDraw.Draw(txt_img)

        # Draw the text onto the blank overlay image
        # (a fill=... arg could set color/opacity explicitly):
        drawing.text(point, txt, font=fnt)
        #drawing.text(point, txt, font=fnt, fill=(255,255,255,255))

        # For RGBA images, Image.alpha_composite(pil_img, txt_img)
        # would work as well; blend() handles any matching mode:
        out_img = Image.blend(pil_img, txt_img, 0.5)

        out_tns = transforms.ToTensor()(out_img).unsqueeze_(0)
        #out_img.show()
        out_img.close()

        return out_tns

    #------------------------------------
    # _get_sample_indices
    #-------------------

    @classmethod
    def _get_sample_indices(cls,
                            img_folder,
                            class_sample_file_pairs,
                            num_imgs=None):
        '''
        If class_sample_file_pairs is provided,
        then num_imgs is ignored.
        
        :param img_folder: folder instance with training images
        :type img_folder: ImageFolder
        :param class_sample_file_pairs: optionally, pairs of 
            class-name and path to training images
        :type class_sample_file_pairs: [(<class-name>, <sample-file-name>)]
        :param num_imgs: for how many images to create spectrograms 
        :type num_imgs: int
        :return: a list of sample indices into img_folder
        :rtype: [int]
        '''

        # Caller requests particular images?
        if class_sample_file_pairs is not None:

            # Convert the (<class-name>,<sample-file_name>)
            # pairs to (<class_idx>,<sample-file-name>)
            requested_class_idx_sample_pairs = [
                (img_folder.class_to_idx[class_name], sample_file_nm)
                for class_name, sample_file_nm in class_sample_file_pairs
            ]

            # Make a more convenient dict
            #   {class-idx : [<sample-file-name>]
            requests = {}
            for class_idx, sample_path in requested_class_idx_sample_pairs:
                try:
                    requests[class_idx].append(sample_path)
                except KeyError:
                    # First sample file for this class:
                    requests[class_idx] = [sample_path]

            found_idxs = []
            for i, (sample_path, class_idx) in enumerate(img_folder.samples):
                try:
                    if os.path.basename(sample_path) in requests[class_idx]:
                        found_idxs.append(i)
                except KeyError:
                    # Not one of the requested samples:
                    continue
            return found_idxs

        # We are asked to randomly pick images
        # from each class:
        num_samples = len(img_folder)
        num_classes = len(img_folder.classes)
        num_samples_to_get = num_samples \
                            if num_imgs is None \
                            else min(num_samples, num_imgs)

        # Create a dict {class-idx : <list of indices into img_folder>}
        # I.e. for each class, list the int indices i
        # such that img_folder[i] is an img in the class.
        #

        class_dict = {}
        for i, (sample_path, class_idx) in enumerate(img_folder.samples):
            try:
                class_dict[class_idx].append(i)
            except KeyError:
                # First sample of this class:
                class_dict[class_idx] = [i]

        # Rough number of images to get per class:
        num_imgs_per_class = round(num_samples_to_get / num_classes)
        _remaining_imgs = num_samples_to_get % num_classes

        to_get_idxs = []
        for class_idx, sample_idx_list in class_dict.items():
            # Get as many random picks from this class's
            # sample IDs as we want samples per class:

            # Do we have fewer samples in this class than
            # we want from each class?
            if len(sample_idx_list) < num_imgs_per_class:
                # Yes: grab them all:
                to_get_idxs.extend(sample_idx_list)
            else:
                sample_idxs = random.sample(sample_idx_list,
                                            num_imgs_per_class)
                to_get_idxs.extend(sample_idxs)

        return to_get_idxs
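
# A minimal, self-contained sketch (not the project's Charter class) of how
# a confusion-matrix figure reaches TensorBoard via SummaryWriter.add_figure(),
# in the spirit of conf_matrix_to_tensorboard() above; all values are
# illustrative:

import matplotlib.pyplot as plt
import numpy as np
from torch.utils.tensorboard import SummaryWriter

conf_matrix = np.array([[5, 1],
                        [2, 7]])            # rows: true class, cols: predicted
class_names = ['sparrow', 'warbler']

fig, ax = plt.subplots()
ax.imshow(conf_matrix, cmap='Blues')
ax.set_xticks(range(len(class_names)))
ax.set_xticklabels(class_names)
ax.set_yticks(range(len(class_names)))
ax.set_yticklabels(class_names)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
fig.suptitle('Confusion Matrix Series')

writer = SummaryWriter('runs/demo')
# Reusing the same tag across steps gives TensorBoard a step slider
# instead of a new image slot per step:
writer.add_figure('Confusion Matrix Series', fig, global_step=0)
writer.close()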
Exemplo n.º 21
0
class TrainScriptRunner(object):
    '''
    Runs multiple copies of a training script, one per
    hyperparameter configuration, distributing the runs
    across this machine's GPUs.
    '''

    #------------------------------------
    # Constructor
    #-------------------

    def __init__(self,
                 starting_config_src,
                 hparms_spec,
                 training_script=None,
                 logfile=None,
                 quiet=False,
                 dryrun=False,
                 unittesting=False):
        '''
        The hparms_spec is expected to be an *ordered* dict
        (i.e. the sequence of keys and values is the same
        across the keys()/values()/items() methods):
        
            {<hparm1> : [val1_1, val1_2, ...],
             <hparm2> : [val2_1, val2_2, ...]
             }
        
        :param starting_config_src: a configuration 
            whose neural net related parameters will 
            be modified below for each run.
        :type starting_config_src: {str | NeuralNetConfig}            
        :param hparms_spec: ordered dict mapping each hyperparameter
            name to the list of values to try for it
        :type hparms_spec: {str : [Any]}
        :param training_script: path to the training script
            of which to run multiple copies. If None, will
            look in config for Path:train_script.
        :type training_script: {None | str}
        :param logfile: where to log runtime information. If None,
            log to console
        :type logfile: {None | str}
        :param quiet: whether or not to report progress
        :type quiet: bool
        :param dryrun: if True, only print the configurations
            that would be run, without starting any processes
        :type dryrun: bool
        :param unittesting: set to True if unittesting so that
            __init__() will only do a minimum, and allows unittests
            to call other methods individually
        :type unittesting: bool
        '''

        if logfile is not None:
            self.log = LoggingService(logfile=logfile)
        else:
            self.log = LoggingService()

        self.quiet = quiet

        self.curr_dir = os.path.dirname(__file__)
        self.hostname = socket.getfqdn()
        # No GPUs identified so far:
        self.WORLD_SIZE = 0

        starting_config = NeuralNetConfig(starting_config_src)
        if unittesting:
            # Leave calling of the methods below
            # to the unittests
            return

        self.training_script = training_script
        if training_script is None:
            # Try to find it in config:
            try:
                self.training_script = starting_config.getpath(
                    'Paths', 'train_script', relative_to=self.curr_dir)
            except KeyError:
                raise ValueError(
                    "Did not provide training script path on cmd line or in config"
                )

        self.gpu_landscape = self.obtain_world_map(starting_config)

        # Get list of dicts of hparm-name/hparm_value pairs;
        # one for each of the runs

        the_run_dicts = self.get_runs_hparm_specs(hparms_spec)

        # Turn the run dicts into configurations
        # that that modify the starting config:
        the_run_configs = self.gen_configurations(starting_config,
                                                  the_run_dicts)

        if dryrun:
            print("Dryrun:")
            print(
                f"Would run {len(the_run_dicts)} processes with these configs:"
            )
            for configs in the_run_dicts:

                print(configs)
            return

        # Provide support for terminating the training
        # script processes nicely upon SIGTERM:

        self.cnt_c_received = False
        signal.signal(signal.SIGTERM, self.handle_cnt_c)
        # Start one training script for each configuration:
        self.run_configurations(the_run_configs)

    #------------------------------------
    # get_runs_hparm_specs
    #-------------------

    def get_runs_hparm_specs(self, hparms_spec):
        '''
        Create a list of dicts. Each dict 
        holds the value for each of the hparms
        for one run.
        
        :param hparms_spec: client's dict of 
            {param_name : [val1, val2, ...]}
        :type hparms_spec: {str : [Any]}
        :return: list of dicts
        '''

        # Running example:

        #     {'lr'         : [0.001],
        #      'optimizer'  : ['Adam','RMSprop','SGD'],
        #      'batch_size' : [32, 64, 128],
        #      'kernel_size': [3, 7]
        #     })

        # Parameters to vary:
        parm_names = list(hparms_spec.keys())

        # Iterate through list of value combinations:
        #     (0.001, 'Adam', 32, 3)
        #     (0.001, 'Adam', 32, 7)
        #     (0.001, 'Adam', 64, 3)
        #        ...
        # to get a list of dicts, each with a
        # unique combination of parameter settings:
        #
        #     [{'lr': 0.001,
        #       'optimizer'  : 'Adam',
        #       'batch_size' : 32,
        #       'kernel_size': 3},
        #       {'lr': 0.001,
        #        'optimizer'  : 'Adam',
        #        'batch_size' : 32,
        #        'kernel_size': 7},
        #       {...}
        #       ...
        #     ]

        hparms_permutations = []

        for _perm_num, ordered_vals_tuple in enumerate(
                product(*hparms_spec.values())):
            # Have something like:
            #   (0.001, 'Adam', 32, 3)
            # Separate dict for each combo:
            conf_dict = dict(zip(parm_names, ordered_vals_tuple))
            hparms_permutations.append(conf_dict)

        return hparms_permutations

    #------------------------------------
    # gen_configurations
    #-------------------

    def gen_configurations(self, config, config_dicts):
        '''
        Takes a list of dicts, and returns a list
        of NeuralNetConfig instances. Each dict
        contains one hyperparameter settings combination
        that is to be tested. Such as:
             [{'lr': 0.001,
               'optimizer': 'Adam',
               'batch_size': 32,
               'kernel_size': 3},
               {'lr': 0.001,
                'optimizer': 'Adam',
                'batch_size': 32,
                'kernel_size': 7},
               {...}
               ...
             ]

        Each return configuration is a copy of the
        config, modified for the respective
        hyperparameter settings. All other parts of
        the config are kept.
        
        :param config: a configuration with
            all settings; only the hyperparameter 
            settings will be modified
        :type config: NeuralNetConfig
        :param config_dicts: one dict of hyperparm-name : value
            for each process to run independently
        :type config_dicts: [{str : Any}]
        :return: list of configurations for the classifier
            script to run
        :rtype: [NeuralNetConfig]
        '''

        configs = []
        for conf_dict in config_dicts:
            conf_copy = config.copy()
            for param_name, val in conf_dict.items():
                conf_copy.add_neural_net_parm(param_name, val)
            configs.append(conf_copy)
        return configs

    #------------------------------------
    # obtain_world_map
    #-------------------

    def obtain_world_map(self, initial_config):
        try:
            self.world_map_path = initial_config.getpath(
                'Paths', 'world_map', relative_to=self.curr_dir)
        except KeyError:
            raise RuntimeError(
                f"Could not find entry for 'world_map' in initial config")

        self.world_map = self.read_world_map(self.world_map_path)
        # Ensure that this machine has an
        # entry in the world_map:
        try:
            # Get this machine's info (sub)dict:
            _my_world_info = self.world_map[self.hostname]
        except KeyError:
            raise ConfigError(
                f"World map file does not contain entry for this machine ({self.hostname})"
            )

        self.compute_landscape = {}
        gpu_landscape = self.build_compute_landscape(self.world_map)
        return gpu_landscape

    #------------------------------------
    # build_compute_landscape
    #-------------------

    def build_compute_landscape(self, world_map):
        '''
        Using the world_map.json config file, build 
        a dict self.gpu_landscape like this:
        
           {'machine_name1' : {'start_rank'    : <int>,
                               'num_gpus'      : <int>,
                               'gpu_device_ids': [<int>,<int>,...]},
            'machine_name2' : {'start_rank'    : <int>,
                               'num_gpus'      : <int>,
                               'gpu_device_ids': [<int>,<int>,...]}
           } 
        
        Also sets:
            o self.master_hostname, the hostname
              running the one process that coordinates all others.
            o self.WORLD_SIZE, number of GPUs used across all machines
            o self.my_gpus, the number of GPUs on this machine
        
        :param world_map: dict mapping machine names to their
            GPU/role specifications
        :type world_map: {str : dict}
        :return: information about how many GPUs are
            on each node
        :rtype: OrderedDict
        '''

        if not self.hostname in world_map.keys():
            raise ConfigError(
                f"World map does not contain an entry for this machine {self.hostname}"
            )

        # Go through the world map, machine (a.k.a. node)
        # one at a time, in alpha order of the machine
        # names to ensure all copies of this script
        # come to the same conclusions about ranks

        # Build gpu_landscape:
        #
        #    {'machine_name1' : {'start_rank'    : <int>,
        #                        'num_gpus'      : <int>,
        #                        'gpu_device_ids': [<int>,<int>,...]
        #    {'machine_name2' : {'start_rank'    : <int>,
        #                        'num_gpus'      : <int>,
        #                        'gpu_device_ids': [<int>,<int>,...]
        #    }
        #
        # The structure is an OrderedDict(), containing
        # machines alphabetically by name. This discipline
        # is required so that all copies of this launch script
        # (one copy per machine) arrive at the same ordering of
        # GPUs:

        gpu_landscape = OrderedDict({})

        machine_name = self.hostname
        machine_info = world_map[self.hostname]

        try:
            machine_gpus = machine_info['gpus']
        except KeyError:
            raise ConfigError(
                "World map must include a 'gpus' entry; the value may be 0")

        gpu_landscape[machine_name] = {}
        gpu_landscape[machine_name]['num_gpus'] = machine_gpus

        # List of GPU numbers to use is optional
        # in world_maps:

        machine_gpus_to_use = machine_info.get('devices', None)

        if machine_gpus_to_use is None:
            # Use all GPUs on this machine:
            machine_gpus_to_use = list(range(machine_gpus))

        gpu_landscape[machine_name]['gpu_device_ids'] = machine_gpus_to_use

        # If no GPUs are available on this machine, still
        # add 1 process, which will run on its CPU:
        self.WORLD_SIZE += machine_gpus if machine_gpus > 0 else 1

        self.my_gpus = gpu_landscape[self.hostname]['num_gpus']
        self.gpu_landscape = gpu_landscape
        return gpu_landscape

    #------------------------------------
    # read_world_map
    #-------------------

    def read_world_map(self, path):
        '''
        Read the JSON5 world map file, and 
        return a corresponding dict. JSON5
        allows something like:
        
        /*
            This is a block comment.
            Notice the lacking quote
            chars around the keys below.
            They are optional in JSON5.
            
        */
        
        {quintus.stanford.edu : {
            "master" : Yes
            "gpus" : 2
         },
        
         quatro.stanford.edu  : {
             "gpus" : 2,
             "devices" : [1,2]
         }
        }
        
        BUT: JSON5 gets angry at dots in the 
             keys. 
        So we first read the file, and try to find 
        the machine names. We temporarily replace
        them with an acceptable marker, and then 
        convert back.
                
        :param path: path to world map file
        :type path: string
        '''
        dot_substitute = '___'

        try:
            # Read all the world map file lines:
            with open(path, 'r') as world_map_fd:
                tmp_world_map = world_map_fd.readlines()
        except IOError as e:
            raise IOError(f"World map file at {path} not found") from e

        # Replace occurrences of '.' with dot_substitute:
        new_text = []
        for line in tmp_world_map:
            new_text.append(line.replace('.', dot_substitute))

        # ... and make one string from all the lines:
        json_str = '\n'.join(new_text)

        try:
            # Hopefully, JSON5 will eat it now:
            world_map_almost = json5.loads(json_str)
        except JSONError as e:
            raise JSONError(
                f"World map file at {path} contains bad JSON") from e

        # Need to fix all the dot substitutions.
        # At this point the data structure is
        #    { <machine_name> : {spec_attr1 : val1,
        #                        spec_attr2 : val2,
        #                       }
        #    }

        # Fix the machine names first:
        mach_names_fixed = [
            machine_name.replace(dot_substitute, '.')
            for machine_name in world_map_almost.keys()
        ]

        machine_specs_fixed = []

        # Now dig into each of the nested machine spec
        # dicts, and fix attrs and values there:
        for spec in world_map_almost.values():
            # Spec is a dict nested inside the outer one:
            spec_fixed = {
                key.replace(dot_substitute, '.'): val.replace(
                    dot_substitute, '.') if isinstance(val, str) else val
                for key, val in spec.items()
            }
            machine_specs_fixed.append(spec_fixed)

        # Put it all together:
        world_map = {
            machine_name: spec_dict
            for machine_name, spec_dict in zip(mach_names_fixed,
                                               machine_specs_fixed)
        }

        return world_map
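
    # Illustration of the dot-substitution round trip performed above,
    # using the host name from the docstring example:
    #   'quintus.stanford.edu' -> 'quintus___stanford___edu' for json5.loads(),
    #   then back to 'quintus.stanford.edu' in the returned dict's keys.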

    #------------------------------------
    # run_configurations
    #-------------------

    def run_configurations(self, run_configs):
        '''
        Takes a list of run configuration that 
        specify the details of a training run 
        (lr, optimizer to use, etc.) Spawns
        independent training script processes, one 
        with each of the configurations.
        
        If fewer CPUs/GPUs are available than the
        number of configs in run_configs, waits for
        processes to finish, then launches more.
        
        Configs may take one of three forms:
            o File path to a config file
            o JSON string with all the config info
            o A NeuralNetConfig instance

        Use world_map.json to know how many, and which 
        GPUs this machine is to use.
        
        Each copy of the training script is told:
        
            o RANK         # The copy's sequence number, which is
                           # unique within this machine (but not 
                           # currently across machines, as it would
                           # be in distributed data parallel (DDP))
            o LOCAL_RANK   # Which of this machine's GPU to use (0-origin)
            o WORLD_SIZE   # How many GPUs are used on all machines together
            o GPUS_USED_THIS_MACHINE # Number of GPUs *used* on this
                                     # machine, according to the world_map.
                                     # (As opposed to number of GPUs that
                                     # exist on this machine.)

        :param run_configs: list of configurations. Each config
            may either be a JSON string, the file name of
            a config file, or a NeuralNetConfig instance
            (see the illustrative comment at the start of the
            method body below)
        :type run_configs: [str | NeuralNetConfig]
        :return 0 if all processes succeeded, else 1
        :rtype: int
        '''
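
        # Illustrative only; the paths and values below are
        # made up, not taken from this project:
        #
        #    run_configs = [
        #        '/home/me/config_lr_small.cfg',       # path to a config file
        #        '{"Training" : {"lr" : 0.001}}',      # JSON string
        #        some_neural_net_config_instance       # a NeuralNetConfig instance
        #        ]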

        gpu_ids_to_use = self.gpu_landscape[self.hostname]['gpu_device_ids']
        cpu_only = len(gpu_ids_to_use) == 0

        self.gpu_manager = GPUManager(gpu_ids_to_use)

        for config in run_configs:

            # Get next available GPU ID, waiting
            # for one to free up, if necessary:

            local_rank = self.gpu_manager.obtain_gpu()

            # Create a command that is fit for passing to
            # Popen; it will start one training script
            # process. On a machine with no GPU the
            # script will run on the CPU:

            cmd = self.training_script_start_cmd(local_rank, config)

            # Copy stdin, and give the copy to the subprocess.
            # This enables the subprocess to ask user whether
            # to save training state in case of a cnt-C:
            newstdin = os.fdopen(os.dup(sys.stdin.fileno()))

            # Spawn one training script. Use psutil's
            # Popen instead of subprocess.Popen to get
            # the wait_procs() method on the resulting
            # process instances:

            process = psutil.Popen(
                cmd,
                stdin=newstdin,
                stdout=None,  # Script inherits this launch
                stderr=None   # ... script's stdout/stderr
            )

            if cpu_only:
                process.wait()
                # CPU operation is for debugging only;
                # bail out right away if something
                # went wrong:
                if process.returncode != 0:
                    print("CPU job ran with errors; see log")
                    return 1
                continue

            # Associate process instance with
            # the configuration it was to run.

            self.gpu_manager.process_register(
                RunInfo(local_rank, process, config, cmd))

        # Launched all configurations; wait for
        # the last of them to be done:

        if cpu_only:
            print("CPU job(s) ran OK")
            return 0

        # Ask for GPUs until we accounted
        # for all that we were allowed to
        # use; that will be indication that
        # all processes finished:

        for _i in range(len(gpu_ids_to_use)):
            self.gpu_manager.obtain_gpu()

        if not self.quiet:
            print(f"Node {self.hostname} {os.path.basename(sys.argv[0])}: " \
                  f"Processed {len(run_configs)} configurations")

        failed_processes = self.gpu_manager.failures()
        if len(failed_processes) > 0:
            print(
                f"Failures: {len(failed_processes)} (check log for error entries):"
            )

            for failed_proc in failed_processes:
                failed_config = self.gpu_manager.process_info(failed_proc)
                train_script = self.training_script
                msg = f"Training script {train_script}: {str(failed_config)}"
                print(msg)
            return 1

        return 0

    #------------------------------------
    # training_script_start_cmd
    #-------------------

    def training_script_start_cmd(self, local_rank, config):
        '''
        From the provided information, creates a command
        (as an argument list suitable for Popen) for starting
        the training script.
        
        :param local_rank: GPU identifier (0-origin index of
            the GPU to use on this machine)
        :type local_rank: int
        :param config: additional information in a config instance,
            or a path to a configuration file
        :type config: {NeuralNetConfig | str}
        :return command for starting the training script
        :rtype: [str]
        '''

        # Build the shell command line,
        # starting with 'python -u':
        cmd = [sys.executable, "-u", f"{self.training_script}"]

        # Add the 'secret' args that tell the training
        # script all the communication parameters:

        cmd.extend([
            f"--LOCAL_RANK={local_rank}",
            f"--WORLD_SIZE={self.WORLD_SIZE}",
        ])

        # Finally, the obligatory non-option arg
        # to the training script: the configuration.
        # Could be a file, a json string, or a
        # NeuralNetConfig instance:

        if isinstance(config, NeuralNetConfig):
            # Turn into a JSON str for communicating
            # to the script:
            config_arg = config.to_json()
            self.log.info(f"\nLAUNCHING TRAINING: " +\
                          f"{NeuralNetConfig.json_human_readable(config_arg)}")
        else:
            config_arg = config
            self.log.info(f"\nLAUNCHING TRAINING from file: {config_arg}")

        cmd.append(config_arg)
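
        # Illustrative only (script name and config path are
        # made up); the finished cmd might look like:
        #
        #    ['/usr/bin/python3', '-u', 'birds_train_parallel.py',
        #     '--LOCAL_RANK=0', '--WORLD_SIZE=4',
        #     '/home/me/config_lr_small.cfg']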

        #self.log.debug(f"****** Launch: the cmd is {cmd}")
        return cmd

    # ------------------- Utils --------------

    #------------------------------------
    # handle_cnt_c
    #-------------------

    def handle_cnt_c(self):
        '''
        Handle a cnt-C (SIGINT) received by this launch
        script: on the first cnt-C, send SIGTERM to all
        still-running training script processes and wait
        for them; on a second cnt-C, quit immediately.
        '''

        if self.cnt_c_received:
            # Just quit after a second
            # cnt-c:
            print(
                f"Hard quit. May wish to check for stray {self.training_script} processes"
            )
            sys.exit(1)

        self.cnt_c_received = True
        for process in self.gpu_manager.process_list():
            # If process is no longer running,
            # forget about it:
            if process.poll() is not None:
                # Process dead:
                continue
            process.send_signal(signal.SIGTERM)
            process.wait()
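
        # Note: registration of this handler is assumed to
        # happen elsewhere (not shown in this excerpt),
        # roughly along the lines of:
        #
        #    signal.signal(signal.SIGINT,
        #                  lambda sig, frame: self.handle_cnt_c())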

    #------------------------------------
    # am_master_node
    #-------------------

    def am_master_node(self):
        '''
        This method allows this script to stay somewhat
        close to the Distributed Data Parallel sibling
        launch_birds_parallel(). For this script,
        though, every process is its own master.
        '''
        return True

    #------------------------------------
    # is_json_str
    #-------------------

    def is_json_str(self, str_to_check):
        '''
        Very primitive test whether a passed-in
        string is (legal) JSON or not.
        
        :param str_to_check: string to examine
        :type str_to_check: str
        :return True/False
        :rtype: bool
        '''
        try:
            json5.loads(str_to_check)
        except JSONError:
            return False
        return True
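
        # Illustrative usage (values are made up):
        #
        #    self.is_json_str('{"lr" : 0.001}')        # --> True
        #    self.is_json_str('/home/me/config.cfg')   # --> False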