Example #1
    def test_visualize_epoch_testing_phase_only(self):
        
        tally0, _tally_val = self.make_tallies(testing=False)

        class_names = ['c1', 'c2']
        epoch = 0

        tally_coll = ResultCollection()
        tally0.phase = LearningPhase.TESTING
        tally_coll.add(tally0, epoch)
        

        TensorBoardPlotter.visualize_step(
            tally_coll,
            self.writer,
            [LearningPhase.TESTING],
            epoch,
            class_names
            )
        # o Should show only viz relevant
        #   to TESTING
        # o visualize_final_epoch_results should move
        #   to tensorboard_plotter
        # o report_hparams_summary should move
        #   to tensorboard_plotter
        
        self.await_user_ack("Should see 21 charts & a 2x2 conf matrix.\n"
                            "Hit key when inspected:")
Example #2
    def setUp(self):
        self.tally_collection = ResultCollection()
        self.num_classes = 4

        self.single_pred = torch.tensor([[0.1, 0.1, 0.5, 0.2]])

        # Label for which the batch is correctly
        # predicted: target class 2
        self.single_label_matching = torch.tensor([2])

        # Label for which the batch is badly predicted:
        # target class 3:
        self.single_label_non_match = torch.tensor([3])

        self.batch_pred = torch.tensor([[0.1, 0.1, 0.5, 0.2],
                                        [0.6, 0.3, 0.5, 0.1]])

        # Labels for which both rows of the batch are
        # correctly predicted: target class 2 for the first
        # row, class 0 for the second:

        self.batch_label_matching = torch.tensor([2, 0])

        # Labels for which the first row is predicted
        # correctly, the second not:

        self.batch_label_non_match = torch.tensor([2, 1])

        # Larger batch:
        self.ten_results = torch.tensor(  # Label
            [
                [0.5922, 0.6546, 0.7172, 0.0139],  #   2
                [0.9124, 0.9047, 0.6819, 0.9329],  #   3
                [0.2345, 0.1733, 0.5420, 0.4659],  #   2
                [0.5954, 0.8958, 0.2294, 0.5529],  #   1
                [0.3861, 0.2918, 0.0972, 0.0548],  #   0
                [0.4647, 0.7002, 0.9632, 0.1320],  #   2
                [0.5064, 0.3124, 0.6235, 0.0118],  #   2
                [0.3487, 0.6241, 0.8620, 0.4953],  #   2
                [0.0386, 0.4663, 0.2362, 0.4898],  #   3
                [0.7019, 0.5001, 0.4052, 0.2223]   #   0
            ]
        )
        self.ten_labels_perfect = torch.tensor([2, 3, 2, 1, 0, 2, 2, 2, 3, 0])
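
        # Sanity check on the data above (assuming, as in the tallies,
        # that a prediction is the row-wise argmax of the raw outputs):
        # torch.argmax(self.ten_results, dim=1) yields
        # tensor([2, 3, 2, 1, 0, 2, 2, 2, 3, 0]), which is exactly
        # self.ten_labels_perfect.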
        self.ten_labels_first_wrong = torch.tensor(
            [0, 3, 2, 1, 0, 2, 2, 2, 3, 0])
Example #3
    def test_copy(self):
        tally1 = self.tally_result(self.ten_labels_perfect,
                                   self.ten_results,
                                   LearningPhase.TRAINING,
                                   epoch=1)
        new_col = ResultCollection.create_from(self.tally_collection)

        # Contents of new collection should be same:
        self.assertEqual(len(new_col), 1)
        new_tally = list(new_col.tallies())[0]
        self.assertEqual(new_tally, tally1)

        for tally_old, tally_new in zip(self.tally_collection.tallies(),
                                        new_col.tallies()):
            self.assertEqual(tally_old, tally_new)
Example #4
    def setup_tensorboard(self, logdir, raw_data_dir=True):
        '''
        Initialize tensorboard. To easily compare experiments,
        use runs/exp1, runs/exp2, etc.
        
        Method creates the dir if needed.
        
        Additionally, sets self.csv_writer to None or to
        an open CSV writer, depending on the value of raw_data_dir;
        see create_csv_writer()
        
        :param logdir: root for tensorboard events
        :type logdir: str
        :param raw_data_dir: whether/where to write raw preds and
            labels; see create_csv_writer()
        :type raw_data_dir: {None | True | str}
        '''

        if not os.path.isdir(logdir):
            os.makedirs(logdir)

        # For storing train/val preds/labels
        # for every epoch. Used to create charts
        # after run is finished:
        self.csv_writer = self.create_csv_writer(raw_data_dir)

        # Place to store intermediate models:
        self.model_archive = \
            self.create_model_archive(self.config,
                                      self.num_classes
                                      )

        # Use SummaryWriterPlus to avoid confusing
        # directory creations when calling add_hparams()
        # on the writer:

        self.writer = SummaryWriterPlus(log_dir=logdir)

        # Intermediate storage for train and val results:
        self.results = ResultCollection()

        self.log.info(
            f"To view tensorboard charts: in shell: tensorboard --logdir {logdir}; then browser: localhost:6006"
        )
Example #5
    def visualize_step(self, step):
        '''
        Take the ResultTally instances
        in the train and val ResultCollections
        in self.results, and report appropriate
        aggregates to tensorboard. Computes
        f1 scores, accuracies, etc. for given
        step.

        Separately for train and validation
        results: build one long array 
        of predictions, and a corresponding
        array of labels. Also, average the
        loss across all instances.
        
        Then write the preds and labels as rows
        to the csv file.

        '''

        val_tally = self.results[(step, str(LearningPhase.VALIDATING))]
        train_tally = self.results[(step, str(LearningPhase.TRAINING))]

        result_coll = ResultCollection()
        result_coll.add(val_tally, step)
        result_coll.add(train_tally, step)

        self.latest_result = {'train': train_tally, 'val': val_tally}

        # If we are to write preds and labels to
        # .csv for later additional processing:

        if self.csv_writer is not None:
            self.csv_writer.writerow([
                step, train_tally.preds, train_tally.labels, val_tally.preds,
                val_tally.labels
            ])

        TensorBoardPlotter.visualize_step(
            result_coll, self.writer,
            [LearningPhase.TRAINING, LearningPhase.VALIDATING], step,
            self.class_names)
        # History of learning rate adjustments:
        lr_this_step = self.optimizer.param_groups[0]['lr']
        self.writer.add_scalar('learning_rate',
                               lr_this_step,
                               global_step=step)
Example #6
    def __init__(self,
                 config_info,
                 device=0,
                 percentage=None,
                 debugging=False):
        '''
        
        :param config_info: all path and training parameters
        :type config_info: NeuralNetConfig
        :param debugging: output lots of debug info
        :type debugging: bool
        :param device: number of GPU to use; default is dev 0
            if any GPU is available
        :type device: {None | int}
        :param percentage: percentage of training data to 
            use
        :type percentage: {int | float}
        '''

        self.log = LoggingService()
        if debugging:
            self.log.logging_level = DEBUG

        if percentage is not None:
            # Integrity check:
            if type(percentage) not in [int, float]:
                raise TypeError(
                    f"Percentage must be int or float, not {type(percentage)}")
            if percentage < 1 or percentage > 100:
                raise ValueError(
                    f"Percentage must be between 1 and 100, not {percentage}")

        if device is None:
            device = 0
            if torch.cuda.is_available():
                torch.cuda.set_device(device)
        else:
            available_gpus = torch.cuda.device_count()
            if available_gpus == 0:
                self.log.info("No GPU available; running on CPU")
            else:
                if device > available_gpus - 1:
                    raise ValueError(
                        f"Asked to operate on device {device}, but only {available_gpus} are available"
                    )
                torch.cuda.set_device(device)

        self.curr_dir = os.path.dirname(os.path.abspath(__file__))

        try:
            self.config = self.initialize_config_struct(config_info)
        except Exception as e:
            msg = f"During config init: {repr(e)}"
            self.log.err(msg)
            raise RuntimeError(msg) from e

        try:
            self.root_train_test_data = self.config.getpath(
                'Paths', 'root_train_test_data', relative_to=self.curr_dir)
        except ValueError as e:
            raise ValueError(
                "Config file must contain an entry 'root_train_test_data' in section 'Paths'"
            ) from e

        self.batch_size = self.config.getint('Training', 'batch_size')
        self.kernel_size = self.config.getint('Training', 'kernel_size')
        self.min_epochs = self.config.Training.getint('min_epochs')
        self.max_epochs = self.config.Training.getint('max_epochs')
        self.lr = self.config.Training.getfloat('lr')
        self.net_name = self.config.Training.net_name
        self.pretrained = self.config.Training.getboolean('pretrained', False)
        self.num_folds = self.config.Training.getint('num_folds')
        self.freeze = self.config.Training.getint('freeze', 0)
        self.to_grayscale = self.config.Training.getboolean(
            'to_grayscale', True)

        self.set_seed(42)

        self.log.info("Parameter summary:")
        self.log.info(f"network     {self.net_name}")
        self.log.info(f"pretrained  {self.pretrained}")
        if self.pretrained:
            self.log.info(f"freeze      {self.freeze}")
        self.log.info(f"min epochs  {self.min_epochs}")
        self.log.info(f"max epochs  {self.max_epochs}")
        self.log.info(f"batch_size  {self.batch_size}")

        self.fastest_device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        self.device = self.fastest_device
        self.num_classes = self.find_num_classes(self.root_train_test_data)

        self.initialize_model()

        sample_width = self.config.getint('Training', 'sample_width', 400)
        sample_height = self.config.getint('Training', 'sample_height', 400)

        self.train_loader = self.get_dataloader(sample_width,
                                                sample_height,
                                                perc_data_to_use=percentage)
        self.log.info(f"Expecting {len(self.train_loader)} batches per epoch")
        num_train_samples = len(self.train_loader.dataset)
        num_classes = len(self.train_loader.dataset.class_names())
        self.log.info(
            f"Training set contains {num_train_samples} samples across {num_classes} classes"
        )

        self.class_names = self.train_loader.dataset.class_names()

        log_dir = os.path.join(self.curr_dir, 'runs')
        raw_data_dir = os.path.join(self.curr_dir, 'runs_raw_results')

        self.setup_tensorboard(log_dir, raw_data_dir=raw_data_dir)

        # Log a few example spectrograms to tensorboard;
        # one per class:
        TensorBoardPlotter.write_img_grid(
            self.writer,
            self.root_train_test_data,
            len(self.class_names),  # Number of images to log: one per class
        )

        # All ResultTally instances are
        # collected here: (num_folds * num_epochs)
        # tallies each for the training and validation phases.

        self.step_results = ResultCollection()

        self.log.debug(
            f"Just before train: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
        )
        try:
            final_step = self.train()
            self.visualize_final_epoch_results(final_step)
        finally:
            self.close_tensorboard()
Example #7
class BirdsBasicTrainerCV:
    '''
    Basic trainer for the birds classifier,
    using k-fold cross validation.
    '''
    # Number of intermediate models to save
    # during training:

    MODEL_ARCHIVE_SIZE = 20

    # For some tensorboard displays:
    # for how many epochs in the past
    # to display data:

    DISPLAY_HISTORY_LEN = 10

    #------------------------------------
    # Constructor
    #-------------------

    def __init__(self,
                 config_info,
                 device=0,
                 percentage=None,
                 debugging=False):
        '''
        
        :param config_info: all path and training parameters
        :type config_info: NeuralNetConfig
        :param debugging: output lots of debug info
        :type debugging: bool
        :param device: number of GPU to use; default is dev 0
            if any GPU is available
        :type device: {None | int}
        :param percentage: percentage of training data to 
            use
        :type percentage: {int | float}
        '''

        self.log = LoggingService()
        if debugging:
            self.log.logging_level = DEBUG

        if percentage is not None:
            # Integrity check:
            if type(percentage) not in [int, float]:
                raise TypeError(
                    f"Percentage must be int or float, not {type(percentage)}")
            if percentage < 1 or percentage > 100:
                raise ValueError(
                    f"Percentage must be between 1 and 100, not {percentage}")

        if device is None:
            device = 0
            if torch.cuda.is_available():
                torch.cuda.set_device(device)
        else:
            available_gpus = torch.cuda.device_count()
            if available_gpus == 0:
                self.log.info("No GPU available; running on CPU")
            else:
                if device > available_gpus - 1:
                    raise ValueError(
                        f"Asked to operate on device {device}, but only {available_gpus} are available"
                    )
                torch.cuda.set_device(device)

        self.curr_dir = os.path.dirname(os.path.abspath(__file__))

        try:
            self.config = self.initialize_config_struct(config_info)
        except Exception as e:
            msg = f"During config init: {repr(e)}"
            self.log.err(msg)
            raise RuntimeError(msg) from e

        try:
            self.root_train_test_data = self.config.getpath(
                'Paths', 'root_train_test_data', relative_to=self.curr_dir)
        except ValueError as e:
            raise ValueError(
                "Config file must contain an entry 'root_train_test_data' in section 'Paths'"
            ) from e

        self.batch_size = self.config.getint('Training', 'batch_size')
        self.kernel_size = self.config.getint('Training', 'kernel_size')
        self.min_epochs = self.config.Training.getint('min_epochs')
        self.max_epochs = self.config.Training.getint('max_epochs')
        self.lr = self.config.Training.getfloat('lr')
        self.net_name = self.config.Training.net_name
        self.pretrained = self.config.Training.getboolean('pretrained', False)
        self.num_folds = self.config.Training.getint('num_folds')
        self.freeze = self.config.Training.getint('freeze', 0)
        self.to_grayscale = self.config.Training.getboolean(
            'to_grayscale', True)

        self.set_seed(42)

        self.log.info("Parameter summary:")
        self.log.info(f"network     {self.net_name}")
        self.log.info(f"pretrained  {self.pretrained}")
        if self.pretrained:
            self.log.info(f"freeze      {self.freeze}")
        self.log.info(f"min epochs  {self.min_epochs}")
        self.log.info(f"max epochs  {self.max_epochs}")
        self.log.info(f"batch_size  {self.batch_size}")

        self.fastest_device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        self.device = self.fastest_device
        self.num_classes = self.find_num_classes(self.root_train_test_data)

        self.initialize_model()

        sample_width = self.config.getint('Training', 'sample_width', 400)
        sample_height = self.config.getint('Training', 'sample_height', 400)

        self.train_loader = self.get_dataloader(sample_width,
                                                sample_height,
                                                perc_data_to_use=percentage)
        self.log.info(f"Expecting {len(self.train_loader)} batches per epoch")
        num_train_samples = len(self.train_loader.dataset)
        num_classes = len(self.train_loader.dataset.class_names())
        self.log.info(
            f"Training set contains {num_train_samples} samples across {num_classes} classes"
        )

        self.class_names = self.train_loader.dataset.class_names()

        log_dir = os.path.join(self.curr_dir, 'runs')
        raw_data_dir = os.path.join(self.curr_dir, 'runs_raw_results')

        self.setup_tensorboard(log_dir, raw_data_dir=raw_data_dir)

        # Log a few example spectrograms to tensorboard;
        # one per class:
        TensorBoardPlotter.write_img_grid(
            self.writer,
            self.root_train_test_data,
            len(self.class_names),  # Number of images to log: one per class
        )

        # All ResultTally instances are
        # collected here: (num_folds * num_epochs)
        # tallies each for the training and validation phases.

        self.step_results = ResultCollection()

        self.log.debug(
            f"Just before train: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
        )
        try:
            final_step = self.train()
            self.visualize_final_epoch_results(final_step)
        finally:
            self.close_tensorboard()

    #------------------------------------
    # train
    #-------------------

    def train(self):

        overall_start_time = datetime.datetime.now()
        # Just for sanity: keep track
        # of number of batches...
        total_batch_num = 0

        # Note: since we are cross validating, the
        # data loader's set_epoch() method is only
        # called once (automatically) during instantiation
        # of the associated sampler. Moving from split
        # to split includes shuffling if the caller
        # specified that.

        # Training
        for split_num in range(self.train_loader.num_folds):

            split_start_time = datetime.datetime.now()
            self.initialize_model()
            for epoch in range(self.max_epochs):

                # Set model to train mode:
                self.model.train()

                epoch_start_time = datetime.datetime.now()

                self.log.info(f"Starting epoch {epoch} training")

                # Sanity check record: will record
                # how many samples from each class were
                # used:
                self.class_coverage = {}

                # Sanity records: will record number
                # of samples of each class that are used
                # during training and validation:
                label_distrib = {}
                batch_num = 0

                self.log.info(
                    f"Train epoch {epoch}/{self.max_epochs} split {split_num}/{self.train_loader.num_folds}"
                )
                try:
                    for batch, targets in self.train_loader:
                        # Update the sanity check
                        # num of batches seen, and distribution
                        # of samples across classes:
                        batch_num += 1
                        total_batch_num += 1

                        # Update sanity check records:
                        for lbl in targets:
                            lbl = int(lbl)
                            try:
                                label_distrib[lbl] += 1
                            except KeyError:
                                label_distrib[lbl] = 1
                            try:
                                self.class_coverage[lbl]['train'] += 1
                            except KeyError:
                                self.class_coverage[lbl] = {
                                    'train': 1,
                                    'val': 0
                                }

                        self.log.debug(
                            f"Top of training loop: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
                        )

                        images = FileUtils.to_device(batch, 'gpu')
                        labels = FileUtils.to_device(targets, 'gpu')

                        outputs = self.model(images)
                        loss = self.loss_fn(outputs, labels)
                        self.optimizer.zero_grad()
                        loss.backward()
                        self.optimizer.step()

                        # Remember the last batch's train result of this
                        # split (results for earlier batches of
                        # the same split will be overwritten). This statement
                        # must sit before deleting outputs and labels:

                        step_num = self.step_number(epoch, split_num,
                                                    self.num_folds)
                        self.remember_results(LearningPhase.TRAINING, step_num,
                                              outputs, labels, loss)

                        self.log.debug(
                            f"Just before clearing gpu: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
                        )

                        images = FileUtils.to_device(images, 'cpu')
                        outputs = FileUtils.to_device(outputs, 'cpu')
                        labels = FileUtils.to_device(labels, 'cpu')
                        loss = FileUtils.to_device(loss, 'cpu')

                        del images
                        del outputs
                        del labels
                        del loss
                        torch.cuda.empty_cache()

                        self.log.debug(
                            f"Just after clearing gpu: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
                        )
                except EndOfSplit:

                    end_time = datetime.datetime.now()
                    train_time_duration = end_time - epoch_start_time
                    # A human-readable duration string, down to minutes:
                    duration_str = FileUtils.time_delta_str(
                        train_time_duration, granularity=4)

                    self.log.info(
                        f"Done training epoch {epoch} of split {split_num} (duration: {duration_str})"
                    )

                    #***********
                    #print(f"****** num_batches in split: {batch_num}" )
                    #print(f"****** LblDist: {label_distrib}")
                    #***********
                    self.validate_split(step_num)
                    self.visualize_step(step_num)
                    # Save model, keeping self.model_archive_size models:
                    self.model_archive.save_model(self.model, epoch)

                    self.log.debug(
                        f"After eval: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
                    )

                    # Next Epoch
                    continue

            end_time = datetime.datetime.now()
            train_time_duration = end_time - split_start_time
            # A human-readable duration string, down to minutes:
            duration_str = FileUtils.time_delta_str(train_time_duration,
                                                    granularity=4)

            self.log.info(
                f"Done training split {split_num} (duration: {duration_str})")

            # Next split
            continue

        end_time = datetime.datetime.now()
        epoch_duration = end_time - epoch_start_time
        epoch_dur_str = FileUtils.time_delta_str(epoch_duration, granularity=4)

        cumulative_dur = end_time - overall_start_time
        cum_dur_str = FileUtils.time_delta_str(cumulative_dur, granularity=4)

        msg = f"Done epoch {epoch}  (epoch duration: {epoch_dur_str}; cumulative: {cum_dur_str})"
        self.log.info(msg)

        #******self.scheduler.step()

        # Fresh results tallying
        #self.results.clear()

        self.log.info(
            f"Training complete after {self.train_loader.num_folds} splits")

        # Report the sanity checks:
        self.log.info(f"Total batches processed: {total_batch_num}")
        for cid in self.class_coverage.keys():
            train_use = self.class_coverage[cid]['train']
            val_use = self.class_coverage[cid]['val']
            self.log.info(
                f"{self.class_names[cid]} Training: {train_use}, Validation: {val_use}"
            )

        # All seems to have gone well. Report the
        # overall result of the final epoch for the
        # hparams config used in this process:

        self.report_hparams_summary(self.latest_result)

        # The final epoch number:
        return epoch

    #------------------------------------
    # validate_split
    #-------------------

    def validate_split(self, step):
        '''
        Validate one split, using that split's 
        validation fold. Return the time taken. Record
        results for tensorboard and other record keeping.
        
        :param step: current combination of epoch and 
            split
        :type step: int
        :return: wall-clock time needed for the validation
        :rtype: datetime.timedelta
        '''
        # Validation

        self.log.debug(
            f"Start of validation: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
        )

        start_time = datetime.datetime.now()
        self.log.info(f"Starting validation for step {step}")

        self.model.eval()
        with torch.no_grad():
            for img_tensor, target in self.train_loader.validation_samples():
                expanded_img_tensor = unsqueeze(img_tensor, dim=0)
                expanded_target = unsqueeze(target, dim=0)

                # Update sanity record:
                self.class_coverage[int(target)]['val'] += 1

                images = FileUtils.to_device(expanded_img_tensor, 'gpu')
                label = FileUtils.to_device(expanded_target, 'gpu')

                outputs = self.model(images)
                loss = self.loss_fn(outputs, label)

                images = FileUtils.to_device(images, 'cpu')
                outputs = FileUtils.to_device(outputs, 'cpu')
                label = FileUtils.to_device(label, 'cpu')
                loss = FileUtils.to_device(loss, 'cpu')

                self.remember_results(LearningPhase.VALIDATING, step, outputs,
                                      label, loss)
                del images
                del outputs
                del label
                del loss
                torch.cuda.empty_cache()

        end_time = datetime.datetime.now()
        val_time_duration = end_time - start_time
        # A human-readable duration string, down to minutes:
        duration_str = FileUtils.time_delta_str(val_time_duration,
                                                granularity=4)
        self.log.info(f"Done validation (duration: {duration_str})")

        return val_time_duration

    # ------------- Utils -----------

    #------------------------------------
    # report_acc_loss
    #-------------------

    def report_acc_loss(self, phase, epoch, accumulated_loss):

        self.writer.add_scalar(f"loss/{phase}", accumulated_loss, epoch)

    #------------------------------------
    # remember_results
    #-------------------

    def remember_results(
        self,
        phase,
        step,
        outputs,
        labels,
        loss,
    ):

        # Add the results
        tally = ResultTally(step, phase, outputs, labels, loss,
                            self.num_classes, self.batch_size)
        # Add result to intermediate results collection of
        # tallies, keyed the way visualize_step() looks them up:
        self.results[(step, str(phase))] = tally

        # Same with the session-wide
        # collection:

        self.step_results.add(tally, step)

    #------------------------------------
    # visualize_step
    #-------------------

    def visualize_step(self, step):
        '''
        Take the ResultTally instances
        in the train and val ResultCollections
        in self.results, and report appropriate
        aggregates to tensorboard. Computes
        f1 scores, accuracies, etc. for given
        step.

        Separately for train and validation
        results: build one long array 
        of predictions, and a corresponding
        array of labels. Also, average the
        loss across all instances.
        
        Then write the preds and labels as rows
        to the csv file.

        '''

        val_tally = self.results[(step, str(LearningPhase.VALIDATING))]
        train_tally = self.results[(step, str(LearningPhase.TRAINING))]

        result_coll = ResultCollection()
        result_coll.add(val_tally, step)
        result_coll.add(train_tally, step)

        self.latest_result = {'train': train_tally, 'val': val_tally}

        # If we are to write preds and labels to
        # .csv for later additional processing:

        if self.csv_writer is not None:
            self.csv_writer.writerow([
                step, train_tally.preds, train_tally.labels, val_tally.preds,
                val_tally.labels
            ])

        TensorBoardPlotter.visualize_step(
            result_coll, self.writer,
            [LearningPhase.TRAINING, LearningPhase.VALIDATING], step,
            self.class_names)
        # History of learning rate adjustments:
        lr_this_step = self.optimizer.param_groups[0]['lr']
        self.writer.add_scalar('learning_rate', lr_this_step, global_step=step)

    #------------------------------------
    # visualize_final_epoch_results
    #-------------------

    def visualize_final_epoch_results(self, epoch):
        '''
        Reports to tensorboard just for the
        final epoch.
 
        Expect self.latest_result to be the latest
        ResultTally.
        '''
        # DISPLAY_HISTORY_LEN holds the number
        # of historic epochs we will show. Two
        # results per epoch --> need
        # 2*DISPLAY_HISTORY_LEN results. But check
        # that there are that many, and show fewer
        # if needed:

        num_res_to_show = min(len(self.step_results),
                              2 * self.DISPLAY_HISTORY_LEN)

        f1_hist = self.step_results[-num_res_to_show:]

        # First: the table of train and val f1-macro
        # scores for the past few epochs:
        #
        #      |phase|ep0  |ep1 |ep2 |
        #      |-----|-----|----|----|
        #      |train| f1_0|f1_1|f1_2|
        #      |  val| f1_0|f1_1|f1_2|

        f1_macro_tbl = TensorBoardPlotter.make_f1_train_val_table(f1_hist)
        self.writer.add_text('f1/history', f1_macro_tbl)

        # Now, in the same tensorboard row: the
        # per_class train/val f1 scores for each
        # class separately:
        #
        # |class|weighted mean f1 train|weighted mean f1 val|
        # |-----|----------------------|--------------------|
        # |  c1 |0.1                   |0.6                 |
        # |  c2 |0.1                   |0.6                 |
        # |  c3 |0.1                   |0.6                 |
        # ------|----------------------|--------------------|

        f1_all_classes = TensorBoardPlotter.make_all_classes_f1_table(
            self.latest_result, self.class_names)
        self.writer.add_text('f1/per-class', f1_all_classes)

    #------------------------------------
    # report_hparams_summary
    #-------------------

    def report_hparams_summary(self, latest_result):
        '''
        Called at the end of training. Constructs
        a summary to report for the hyperparameters
        used in this process. Reports to the tensorboard.
         
        Hyperparameters reported:
         
           o lr
           o optimizer
           o batch_size
           o kernel_size
         
        Included in the measures are:
         
           o balanced_accuracy      (train and val)
           o mean_accuracy          (train and val)
           o epoch_prec_weighted    (val only)
           o epoch_recall_weighted  (val only)
           o epoch_mean_loss        (train and val)
           
         
        :param latest_result: dict with keys 'train' and
            'val', holding the respective most recent
            (i.e. last-epoch) ResultTally
        :type latest_result: {'train' : ResultTally,
                               'val'   : ResultTally
                               }
        '''

        # Get the latest validation tally:
        train_tally = latest_result['train']
        val_tally = latest_result['val']

        hparms_vals = OrderedDict({
            'net': self.net_name,
            'pretrained': f"{self.pretrained}",
            'lr_initial': self.config.Training.lr,
            'optimizer': self.config.Training.opt_name,
            'batch_size': self.config.getint('Training', 'batch_size'),
            'kernel_size': self.config.getint('Training', 'kernel_size'),
            'to_grayscale': self.to_grayscale
        })

        metric_results = {
            'zz_balanced_adj_acc_train': train_tally.balanced_acc,
            'zz_balanced_adj_acc_val': val_tally.balanced_acc,
            'zz_acc_train': train_tally.accuracy,
            'zz_acc_val': val_tally.accuracy,
            'zz_epoch_weighted_prec': val_tally.prec_weighted,
            'zz_epoch_weighted_recall': val_tally.recall_weighted,
            'zz_epoch_mean_loss_train': train_tally.mean_loss,
            'zz_epoch_mean_loss_val': val_tally.mean_loss
        }

        self.writer.add_hparams(hparms_vals, metric_results)
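
    # Note (assuming SummaryWriterPlus keeps SummaryWriter's
    # add_hparams() semantics): add_hparams() records the hparams
    # table plus these zz_* metrics, so TensorBoard's HPARAMS tab
    # can compare runs that used different configurations.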

    #------------------------------------
    # get_dataloader
    #-------------------

    def get_dataloader(self,
                       sample_width,
                       sample_height,
                       perc_data_to_use=None):
        '''
        Returns a cross validating dataloader. 
        If perc_data_to_use is None, all samples
        under self.root_train_test_data will be
        used for training. Else percentage indicates
        the percentage of those samples to use. The
        selection is random.
        
        :param sample_width: pixel width of returned images
        :type sample_width: int
        :param sample_height: pixel height of returned images
        :type sample_height: int
        :param perc_data_to_use: amount of available training
            data to use.
        :type perc_data_to_use: {None | int | float}
        :return: a data loader that serves batches of
            images and their associated labels
        :rtype: CrossValidatingDataLoader
        '''

        data_root = self.root_train_test_data

        train_dataset = SingleRootImageDataset(data_root,
                                               sample_width=sample_width,
                                               sample_height=sample_height,
                                               percentage=perc_data_to_use,
                                               to_grayscale=self.to_grayscale)

        sampler = SKFSampler(train_dataset,
                             num_folds=self.num_folds,
                             seed=42,
                             shuffle=True,
                             drop_last=True)

        train_loader = CrossValidatingDataLoader(train_dataset,
                                                 batch_size=self.batch_size,
                                                 shuffle=True,
                                                 drop_last=True,
                                                 sampler=sampler,
                                                 num_folds=self.num_folds)
        return train_loader
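
    # Usage sketch (mirrors how train() consumes the loader; EndOfSplit
    # and validation_samples() are the custom loader's API as used
    # elsewhere in this class):
    #
    #     loader = self.get_dataloader(400, 400)
    #     try:
    #         for batch, targets in loader:
    #             ...   # train on this fold's batches
    #     except EndOfSplit:
    #         for img, target in loader.validation_samples():
    #             ...   # validate on the held-out fold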

    #------------------------------------
    # initialize_model
    #-------------------

    def initialize_model(self):
        self.model = NetUtils.get_net(self.net_name,
                                      num_classes=self.num_classes,
                                      pretrained=self.pretrained,
                                      freeze=self.freeze,
                                      to_grayscale=self.to_grayscale)
        self.log.debug(
            f"Before any gpu push: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
        )

        FileUtils.to_device(self.model, 'gpu')

        self.log.debug(
            f"Just after model push: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
        )

        self.opt_name = self.config.Training.get('optimizer',
                                                 'Adam')  # Default
        self.optimizer = self.get_optimizer(self.opt_name, self.model, self.lr)

        self.loss_fn = nn.CrossEntropyLoss()
        self.scheduler = optim.lr_scheduler.CosineAnnealingLR(
            self.optimizer, self.min_epochs)

    #------------------------------------
    # find_num_classes
    #-------------------

    def find_num_classes(self, data_root):
        '''
        Expect two subdirectories under data_root:
        train and validation. Underneath each are 
        further subdirectories whose names are the
        classes:
        
                train               validation
        class1 class2 class3     class1 class2 class3
          imgs   imgs   imgs       imgs   imgs   imgs
        
        No error checking is done to confirm this structure.
        
        :param data_root: path to parent of train/validation
        :type data_root: str
        :return: number of unique classes as obtained
            from the directory names
        :rtype: int
        '''
        self.classes = FileUtils.find_class_names(data_root)
        return len(self.classes)

    #------------------------------------
    # setup_tensorboard
    #-------------------

    def setup_tensorboard(self, logdir, raw_data_dir=True):
        '''
        Initialize tensorboard. To easily compare experiments,
        use runs/exp1, runs/exp2, etc.
        
        Method creates the dir if needed.
        
        Additionally, sets self.csv_writer to None or to
        an open CSV writer, depending on the value of raw_data_dir;
        see create_csv_writer()
        
        :param logdir: root for tensorboard events
        :type logdir: str
        :param raw_data_dir: whether/where to write raw preds and
            labels; see create_csv_writer()
        :type raw_data_dir: {None | True | str}
        '''

        if not os.path.isdir(logdir):
            os.makedirs(logdir)

        # For storing train/val preds/labels
        # for every epoch. Used to create charts
        # after run is finished:
        self.csv_writer = self.create_csv_writer(raw_data_dir)

        # Place to store intermediate models:
        self.model_archive = \
            self.create_model_archive(self.config,
                                      self.num_classes
                                      )

        # Use SummaryWriterPlus to avoid confusing
        # directory creations when calling add_hparams()
        # on the writer:

        self.writer = SummaryWriterPlus(log_dir=logdir)

        # Intermediate storage for train and val results:
        self.results = ResultCollection()

        self.log.info(
            f"To view tensorboard charts: in shell: tensorboard --logdir {logdir}; then browser: localhost:6006"
        )

    #------------------------------------
    # create_csv_writer
    #-------------------

    def create_csv_writer(self, raw_data_dir):
        '''
        Create a csv_writer that will fill a csv
        file during training/validation as follows:
        
            epoch  train_preds   train_labels  val_preds  val_labels
            
        Cols after the integer 'epoch' col will each be
        an array of ints:
        
                  train_preds    train_lbls   val_preds  val_lbls
                2,"[2,5,1,2,3]","[2,6,1,2,1]","[1,2]",    "[1,3]" 
        
        If raw_data_dir is provided as a str, it is
        taken as the directory where csv file with predictions
        and labels are to be written. The dir is created if necessary.
         
        If the arg is instead set to True, a dir 'runs_raw_results' is
        created under this script's directory if it does not
        exist. Then a subdirectory is created for this run,
        using the hparam settings to build a file name. The dir
        is created if needed. Result ex.:
        
              <script_dir>
                   runs_raw_results
                       Run_lr_0.001_br_32
                           run_2021_05_ ... _lr_0.001_br_32.csv
        
        
        Then the file name is created, again from the run
        hparam settings. If this file exists, the user is asked whether
        to overwrite or append. The inst var self.csv_writer is
        initialized to:
        
           o None if the csv file exists, but is not to 
             be overwritten nor appended-to
           o A file descriptor for a file open for either
             'write' or 'append'.
        
        :param raw_data_dir: If simply True, create dir and file names
            from hparams, and create as needed. If a string, it is 
            assumed to be the directory where a .csv file is to be
            created. If None, self.csv_writer is set to None.
        :type raw_data_dir: {None | True | str}
        :return: CSV writer ready for action. Set either to
            write a fresh file, or append to an existing file.
            Unless file exists, and user decided not to overwrite
        :rtype: {None | csv.writer}
        '''

        # Ensure the csv file root dir exists if
        # we'll do a csv dir and run-file below it:

        if type(raw_data_dir) == str:
            raw_data_root = raw_data_dir
        else:
            raw_data_root = os.path.join(self.curr_dir, 'runs_raw_results')

        if not os.path.exists(raw_data_root):
            os.mkdir(raw_data_root)

        # Can rely on raw_data_root being defined and existing:

        if raw_data_dir is None:
            return None

        # Create both a raw-results subdirectory and a .csv file
        # for this run:
        csv_subdir_name = os.path.join(
            raw_data_root,
            FileUtils.construct_filename(self.config.Training,
                                         prefix='Run',
                                         incl_date=True))
        os.makedirs(csv_subdir_name, exist_ok=True)

        # Create a csv file name:
        csv_file_nm = FileUtils.construct_filename(self.config.Training,
                                                   prefix='run',
                                                   suffix='.csv',
                                                   incl_date=True)

        csv_path = os.path.join(csv_subdir_name, csv_file_nm)

        # Get csv_raw_fd appropriately:

        if os.path.exists(csv_path):
            do_overwrite = FileUtils.user_confirm(
                f"File {csv_path} exists; overwrite?", default='N')
            if do_overwrite:
                mode = 'w'
            else:
                do_append = FileUtils.user_confirm("Append instead?",
                                                   default='N')
                if not do_append:
                    return None
                mode = 'a'
        else:
            mode = 'w'

        csv_writer = CSVWriterCloseable(csv_path, mode=mode, delimiter=',')

        header = [
            'epoch', 'train_preds', 'train_labels', 'val_preds', 'val_labels'
        ]
        # Write the header only when starting a fresh file;
        # appending would duplicate it otherwise:
        if mode == 'w':
            csv_writer.writerow(header)

        return csv_writer
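
    # Sketch of reading such a .csv back (assumes the array columns are
    # written as Python list literals, as in the docstring's example row;
    # ast.literal_eval is the stdlib way to parse those safely):
    #
    #     import ast, csv
    #     with open(csv_path) as fd:
    #         for row in csv.DictReader(fd):
    #             train_preds = ast.literal_eval(row['train_preds'])
    #             val_labels  = ast.literal_eval(row['val_labels'])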

    #------------------------------------
    # create_model_archive
    #-------------------

    def create_model_archive(self, config, num_classes):
        '''
        Creates facility for saving partially trained
        models along the way.
        
        :param config: the run configuration
        :type config: NeuralNetConfig
        :param num_classes: number of target classes
        :type num_classes: int
        :return: ModelArchive instance ready
            for calls to save_model()
        :rtype: ModelArchive
        '''
        model_archive = ModelArchive(config,
                                     num_classes,
                                     history_len=self.MODEL_ARCHIVE_SIZE,
                                     log=self.log)
        return model_archive

    #------------------------------------
    # close_tensorboard
    #-------------------

    def close_tensorboard(self):
        if self.csv_writer is not None:
            try:
                self.csv_writer.close()
            except Exception as e:
                self.log.warn(f"Could not close csv file: {repr(e)}")
        try:
            self.writer.close()
        except AttributeError:
            self.log.warn(
                "Method close_tensorboard() called before setup_tensorboard()?"
            )
        except Exception as e:
            raise RuntimeError(
                f"Problem closing tensorboard: {repr(e)}") from e

    #------------------------------------
    # get_optimizer
    #-------------------

    def get_optimizer(self, optimizer_name, model, lr):

        optimizer_name = optimizer_name.lower()
        if optimizer_name == 'adam':
            optimizer = optim.Adam(model.parameters(),
                                   lr=lr,
                                   eps=1e-3,
                                   amsgrad=True)
            return optimizer

        if optimizer_name == 'sgd':
            optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
            return optimizer

        if optimizer_name == 'rmsprop':
            optimizer = optim.RMSprop(model.parameters(), lr=lr, momentum=0.9)
            return optimizer

        raise ValueError(f"Optimizer {optimizer_name} not supported")

    #------------------------------------
    # initialize_config_struct
    #-------------------

    def initialize_config_struct(self, config_info):
        '''
        Initialize a config dict of dict with
        the application's configurations. Sections
        will be:
        
          config['Paths']       -> dict[attr : val]
          config['Training']    -> dict[attr : val]
          config['Parallelism'] -> dict[attr : val]
        
        The config read method will handle config_info
        being None. 
        
        If config_info is a string, it is assumed either 
        to be a file containing the configuration, or
        a JSON string that defines the config.
         
        Else config_info is assumed to be a NeuralNetConfig.
        The latter is relevant only if using this file
        as a library, rather than a command line tool.
        
        If given a NeuralNetConfig instance, it is returned
        unchanged. 
        
        :param config_info: the information needed to construct
            the structure
        :type config_info: {NeuralNetConfig | str}
        :return: a NeuralNetConfig instance with all parms
            initialized
        :rtype: NeuralNetConfig
        '''

        if isinstance(config_info, str):
            # Is it a JSON str? Should have a better test!
            if config_info.startswith('{'):
                # JSON String:
                config = NeuralNetConfig.from_json(config_info)
            else:
                config = self.read_configuration(config_info)
        elif isinstance(config_info, NeuralNetConfig):
            config = config_info
        else:
            msg = f"Error: must have a config file, not {config_info}. See config.cfg.Example in project root"
            # Since logdir may be in config, need to use print here:
            print(msg)
            raise ConfigError(msg)

        return config

    #------------------------------------
    # read_configuration
    #-------------------

    def read_configuration(self, conf_file):
        '''
        Parses config file that describes training parameters,
        various file paths, and how many GPUs different machines have.
        Syntax follows Python's configparser module, which includes
        sections, and attr/val pairs in each section.
        
        Expected sections:

           o Paths: various file paths for the application
           o Training: holds batch sizes, number of epochs, etc.
           o Parallelism: holds number of GPUs on different machines
        
        For Parallelism, expect entries like:
        
           foo.bar.com  = 4
           127.0.0.1    = 5
           localhost    = 3
           172.12.145.1 = 6
           
        The method identifies which of the entries refers to this
        machine by comparing them against the local hostname;
        'localhost' or '127.0.0.1' may also be given explicitly.
        
        Returns a dict of dicts: 
            config[section-names][attr-names-within-section]
            
        Types of standard entries, such as epochs, batch_size,
        etc. are coerced, so that, e.g. config['Training']['epochs']
        will be an int. Clients may add non-standard entries.
        For those the client must convert values from string
        (the type in which values are stored by default) to the
        required type. This can be done the usual way: int(...),
        or using one of the configparser's retrieval methods
        getboolean(), getint(), and getfloat():
        
            config['Training'].getfloat('learning_rate')
        
        :param conf_file: path to configuration file
        :type conf_file: str
        :return: a dict of dicts mirroring the config file sections/entries
        :rtype: dict[dict]
        :raises ValueError
        :raises TypeError
        '''

        if conf_file is None:
            return self.init_defaults()

        config = DottableConfigParser(conf_file)

        if len(config.sections()) == 0:
            # Config file exists, but empty:
            return (self.init_defaults(config))

        # Do type conversion also in other entries that
        # are standard:

        types = {
            'epochs': int,
            'batch_size': int,
            'kernel_size': int,
            'sample_width': int,
            'sample_height': int,
            'seed': int,
            'pytorch_comm_port': int,
            'num_pretrained_layers': int,
            'root_train_test_data': str,
            'net_name': str,
        }
        for section in config.sections():
            for attr_name in config[section].keys():
                try:
                    str_val = config[section][attr_name]
                    required_type = types[attr_name]
                    config[section][attr_name] = required_type(str_val)
                except KeyError:
                    # Current attribute is not standard;
                    # users of the corresponding value need
                    # to do their own type conversion when
                    # accessing this configuration entry:
                    continue
                except (TypeError, ValueError):
                    raise ValueError(
                        f"Config file error: {section}.{attr_name} should be convertible to {required_type}"
                    )

        return config
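
    # For reference, a minimal config file satisfying the expectations
    # above might look like this (option names are the ones read in
    # __init__; the values are illustrative only):
    #
    #     [Paths]
    #     root_train_test_data = /path/to/train_test_data
    #
    #     [Training]
    #     net_name    = resnet18
    #     batch_size  = 32
    #     kernel_size = 7
    #     min_epochs  = 2
    #     max_epochs  = 15
    #     lr          = 0.001
    #     optimizer   = Adam
    #     num_folds   = 10
    #
    #     [Parallelism]
    #     localhost = 1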

    #------------------------------------
    # set_seed
    #-------------------

    def set_seed(self, seed):
        '''
        Set the seed across all different necessary platforms
        to allow for comparison of different models and runs
        
        :param seed: random seed to set for all random num generators
        :type seed: int
        '''
        torch.manual_seed(seed)
        cuda.manual_seed_all(seed)
        # Make cuDNN deterministic: use the same convolution
        # algorithms every run instead of benchmarking for the fastest:
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        np.random.seed(seed)
        os.environ['PYTHONHASHSEED'] = str(seed)
        random.seed(seed)

    #------------------------------------
    # time_delta_str
    #-------------------

    def time_delta_str(self, epoch_delta, granularity=2):
        '''
        Takes the difference between two datetime times:
        
               start_time = datetime.datetime.now()
               <some time elapses>
               end_time = datetime.datetime.now()
               
               delta = end_time - start_time
               time_delta_str(delta)
        
        Depending on granularity, returns a string like:
        
            Granularity:
                      1  '160.0 weeks'
                      2  '160.0 weeks, 4.0 days'
                      3  '160.0 weeks, 4.0 days, 6.0 hours'
                      4  '160.0 weeks, 4.0 days, 6.0 hours, 42.0 minutes'
                      5  '160.0 weeks, 4.0 days, 6.0 hours, 42.0 minutes, 13.0 seconds'
        
            For smaller time deltas, such as 10 seconds,
            does not include leading zero times. For
            any granularity:
            
                          '10.0 seconds'

            If duration is less than second, returns '< 1sec>'
            
        :param epoch_delta: the time difference to render
        :type epoch_delta: datetime.timedelta
        :param granularity: number of most-significant time units to include
        :type granularity: int
        '''
        intervals = (
            ('weeks', 604800),  # 60 * 60 * 24 * 7
            ('days', 86400),  # 60 * 60 * 24
            ('hours', 3600),  # 60 * 60
            ('minutes', 60),
            ('seconds', 1),
        )
        secs = epoch_delta.total_seconds()
        result = []
        for name, count in intervals:
            value = secs // count
            if value:
                secs -= value * count
                if value == 1:
                    name = name.rstrip('s')
                result.append("{} {}".format(value, name))
        dur_str = ', '.join(result[:granularity])
        if len(dur_str) == 0:
            dur_str = '< 1sec>'
        return dur_str
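
    # Doctest-style example (values follow from the logic above;
    # not part of the original code):
    #
    #     >>> delta = datetime.timedelta(seconds=3725)
    #     >>> self.time_delta_str(delta, granularity=2)
    #     '1.0 hour, 2.0 minutes'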

    #------------------------------------
    # step_number
    #-------------------

    def step_number(self, epoch, split_num, num_folds):
        '''
        Combines an epoch with a split number into 
        a single integer series that increases with
        epochs, while split_num cycles from 0 to num_folds - 1.
        
        :param epoch: epoch to encode
        :type epoch: int
        :param split_num: split number to encode
        :type split_num: int
        :param num_folds: number of folds for CV splitting;
            must be constant across calls!
        :type num_folds: int
        :return: an integer that combines epoch and split-num
        :rtype: int
        '''

        step_num = epoch * num_folds + split_num
        return step_num
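
    # Worked example (pure arithmetic): with num_folds=5 the series runs
    #   epoch 0 -> steps 0..4,  epoch 1 -> steps 5..9, ...
    # so step_number(epoch=2, split_num=3, num_folds=5) == 2*5 + 3 == 13.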

    #------------------------------------
    # cleanup
    #-------------------

    def cleanup(self):
        '''
        Recover resources taken by collaborating
        processes. OK to call multiple times.
        '''
        # self.clear_gpu()

        try:
            self.writer.close()
        except Exception as e:
            self.log.err(f"Could not close tensorboard writer: {repr(e)}")
Example #8
    def test_visualize_epoch_train_plus_val(self):
        
        tally0, _tally_val = self.make_tallies(testing=False)

        tally_coll = ResultCollection()

        f1_macro = tally0.f1_macro
        f1_micro = tally0.f1_micro
        f1_weighted = tally0.f1_weighted
        preds   = tally0.preds  # 2 predictions
        labels  = tally0.labels # 2 labels
        
        # For testing, prepare 
        # a train and a val tally,
        # each for epochs 0 and 1.
        # Each visualize_step call below
        # must show results for only
        # one of the epochs:

        
        for epoch,phase in zip([0,0,1,1], 
                               [LearningPhase.TRAINING,
                                LearningPhase.VALIDATING,
                                LearningPhase.TRAINING,
                                LearningPhase.VALIDATING
                                ]):

            f1_macro += 0.1
            f1_micro += 0.1
            f1_weighted += 0.1
            
            new_tally = self.clone_tally(tally0, 
                                         f1_macro=f1_macro,
                                         f1_micro=f1_micro,
                                         f1_weighted=f1_weighted,
                                         phase=phase,
                                         preds=preds,
                                         labels=labels,
                                         epoch=epoch
                                         )
            new_tally.accuracy += \
                new_tally.accuracy * 10**(-1/(epoch+1))
            new_tally.balanced_acc += \
                0.1 + 10**(-1/(epoch+1))
            
            tally_coll.add(new_tally, epoch=epoch)

        class_names = ['c1', 'c2']
        TensorBoardPlotter.visualize_step(
            tally_coll,
            self.writer,
            [LearningPhase.TRAINING, LearningPhase.VALIDATING],
            0,
            class_names
            )
        TensorBoardPlotter.visualize_step(
            tally_coll,
            self.writer,
            [LearningPhase.TRAINING, LearningPhase.VALIDATING],
            1,
            class_names
            )

        self.await_user_ack(f"Should see 21 charts & a 2x2 conf matrix.\n" +\
                            "Hit key when inspected:")
Example #9
class Test(unittest.TestCase):

    #------------------------------------
    # setUp
    #-------------------

    def setUp(self):
        self.tally_collection = ResultCollection()
        self.num_classes = 4

        self.single_pred = torch.tensor([[0.1, 0.1, 0.5, 0.2]])

        # Label leading to batch correctly
        # predicted: target class 2
        self.single_label_matching = torch.tensor([2])

        # Label leading to batch badly predicted:
        # target class 3:
        self.single_label_non_match = torch.tensor([3])

        self.batch_pred = torch.tensor([[0.1, 0.1, 0.5, 0.2],
                                        [0.6, 0.3, 0.5, 0.1]])

        # Labels leading to both batches correctly
        # predicted: target class 2 for first row,
        # class 0 for second row:

        self.batch_label_matching = torch.tensor([2, 0])

        # Labels that lead to first batch correct,
        # second not:

        self.batch_label_non_match = torch.tensor([2, 1])

        # Larger batch:
        self.ten_results = torch.tensor(  # Label
            [
                [0.5922, 0.6546, 0.7172, 0.0139],  #   2
                [0.9124, 0.9047, 0.6819, 0.9329],  #   3
                [0.2345, 0.1733, 0.5420, 0.4659],  #   2
                [0.5954, 0.8958, 0.2294, 0.5529],  #   1
                [0.3861, 0.2918, 0.0972, 0.0548],  #   0
                [0.4647, 0.7002, 0.9632, 0.1320],  #   2
                [0.5064, 0.3124, 0.6235, 0.0118],  #   2
                [0.3487, 0.6241, 0.8620, 0.4953],  #   2
                [0.0386, 0.4663, 0.2362, 0.4898],  #   3
                [0.7019, 0.5001, 0.4052, 0.2223]
            ]  #   0
        )
        self.ten_labels_perfect = torch.tensor([2, 3, 2, 1, 0, 2, 2, 2, 3, 0])
        self.ten_labels_first_wrong = torch.tensor(
            [0, 3, 2, 1, 0, 2, 2, 2, 3, 0])

    #------------------------------------
    # tearDown
    #-------------------

    def tearDown(self):
        pass

    #------------------------------------
    # test_basics_single_split
    #-------------------

    @unittest.skipIf(TEST_ALL != True, 'skipping temporarily')
    def test_basics_single_split(self):
        tally = self.tally_result(self.single_label_matching, self.single_pred,
                                  LearningPhase.TRAINING)
        self.assertEqual(tally.epoch, 1)
        # self.assertEqual(tally.num_samples, 1)
        # self.assertEqual(tally.num_correct, 1)
        # self.assertEqual(tally.num_wrong, 0)

        tally = self.tally_result(self.single_label_non_match,
                                  self.single_pred, LearningPhase.TRAINING)
        self.assertEqual(tally.epoch, 1)
        # self.assertEqual(tally.num_samples, 1)
        # self.assertEqual(tally.num_correct, 0)
        # self.assertEqual(tally.num_wrong, 1)

    #------------------------------------
    # test_basics_two_splits
    #-------------------

    @unittest.skipIf(TEST_ALL != True, 'skipping temporarily')
    def test_basics_two_splits(self):
        tally = self.tally_result(self.batch_label_matching, self.batch_pred,
                                  LearningPhase.TRAINING)
        self.assertEqual(tally.epoch, 1)
        # self.assertEqual(tally.num_samples, 2)
        # self.assertEqual(tally.num_correct, 2)
        # self.assertEqual(tally.num_wrong, 0)

        tally = self.tally_result(self.batch_label_non_match, self.batch_pred,
                                  LearningPhase.TRAINING)
        self.assertEqual(tally.epoch, 1)
        # self.assertEqual(tally.num_samples, 2)
        # self.assertEqual(tally.num_correct, 1)
        # self.assertEqual(tally.num_wrong, 1)

    #------------------------------------
    # test_accuracy
    #-------------------

    @unittest.skipIf(TEST_ALL != True, 'skipping temporarily')
    def test_accuracy(self):
        # Single split, correct prediction
        tally = self.tally_result(self.single_label_matching, self.single_pred,
                                  LearningPhase.TRAINING)
        self.assertEqual(tally.accuracy, 1)

        # Single split, incorrect prediction
        tally = self.tally_result(self.single_label_non_match,
                                  self.single_pred, LearningPhase.TRAINING)
        self.assertEqual(tally.accuracy, 0)

        # Two splits, correct predictions
        tally = self.tally_result(self.batch_label_matching, self.batch_pred,
                                  LearningPhase.TRAINING)
        self.assertEqual(tally.accuracy, 1)

        # Two splits, incorrect predictions
        tally = self.tally_result(self.batch_label_non_match, self.batch_pred,
                                  LearningPhase.TRAINING)
        self.assertEqual(tally.accuracy, 0.5)
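        # Why 0.5, worked through from the fixtures above:
        # argmax over the batch_pred rows yields predictions
        # [2, 0]; compared with batch_label_non_match ([2, 1]),
        # only the first sample is correct.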

    #------------------------------------
    # test_result_collection_generator
    #-------------------

    @unittest.skipIf(TEST_ALL != True, 'skipping temporarily')
    def test_result_collection_generator(self):
        '''
        Generator functionality of ResultCollection.
        Should deliver sequence of ResultTally instances.
        '''
        # Epoch 1, learning phase TRAINING
        _tally_ep1_lp_train1 = self.tally_result(self.ten_labels_perfect,
                                                 self.ten_results,
                                                 LearningPhase.TRAINING,
                                                 epoch=1)
        # Epoch 2, learning phase TRAINING
        _tally_ep2_lp_train2 = self.tally_result(self.ten_labels_perfect,
                                                 self.ten_results,
                                                 LearningPhase.TRAINING,
                                                 epoch=2)
        # Epoch 3, learning phase TRAINING
        _tally_ep3_lp_train3 = self.tally_result(self.ten_labels_first_wrong,
                                                 self.ten_results,
                                                 LearningPhase.TRAINING,
                                                 epoch=3)
        # Second Epoch 1 result:
        _tally_ep1_lp_test1 = self.tally_result(self.ten_labels_first_wrong,
                                                self.ten_results,
                                                LearningPhase.TESTING,
                                                epoch=1)

        tallies_sorted = [
            _tally_ep1_lp_train1, _tally_ep2_lp_train2, _tally_ep3_lp_train3,
            _tally_ep1_lp_test1
        ]

        # All tallies, sorted by time:
        tallies = list(self.tally_collection.tallies())
        self.assertEqual(tallies, tallies_sorted)

        # All TRAINING tallies, sorted by time:
        tallies = list(
            self.tally_collection.tallies(
                learning_phase=LearningPhase.TRAINING))
        self.assertEqual(tallies, tallies_sorted[:3])

        # All TESTING tallies, sorted by time:
        tallies = list(
            self.tally_collection.tallies(
                learning_phase=LearningPhase.TESTING))
        self.assertTrue(tallies[0] == tallies_sorted[3])

        # All tallies, sorted by time, but only TESTING in
        # epoch 2; no such tally exists, so the result
        # should be empty:
        tallies = list(
            self.tally_collection.tallies(
                epoch=2, learning_phase=LearningPhase.TESTING))
        self.assertEqual(tallies, [])

    #------------------------------------
    # test_collection_num_classes
    #-------------------

    @unittest.skipIf(TEST_ALL != True, 'skipping temporarily')
    def test_collection_num_classes(self):
        '''
        Whether collections properly ask their
        first ResultTally instance for the number
        of classes
        '''

        # Nothing added to the collection yet,
        # so its length should be 0:
        self.assertEqual(len(self.tally_collection), 0)

        _tally1 = self.tally_result(self.ten_labels_perfect,
                                    self.ten_results,
                                    LearningPhase.TRAINING,
                                    epoch=1)
        # Epoch 1, learning phase TRAINING
        _tally2 = self.tally_result(self.ten_labels_perfect,
                                    self.ten_results,
                                    LearningPhase.TRAINING,
                                    epoch=1)
        self.tally_collection.add(_tally1, 1)
        self.tally_collection.add(_tally2, 1)
        # Because results are equal should still only
        # have one result in collection:
        self.assertEqual(len(self.tally_collection), 1)

    #------------------------------------
    # test_copy
    #-------------------

    @unittest.skipIf(TEST_ALL != True, 'skipping temporarily')
    def test_copy(self):
        tally1 = self.tally_result(self.ten_labels_perfect,
                                   self.ten_results,
                                   LearningPhase.TRAINING,
                                   epoch=1)
        new_col = ResultCollection.create_from(self.tally_collection)

        # Contents of new collection should be same:
        self.assertEqual(len(new_col), 1)
        new_tally = list(new_col.tallies())[0]
        self.assertTrue(new_tally == tally1)

        for tally_old, tally_new in zip(self.tally_collection.tallies(),
                                        new_col.tallies()):
            self.assertTrue(tally_old == tally_new)

    # ****** Needs thinking and debugging in result_tallying
#     #------------------------------------
#     # test_within_class_recall_aggregation
#     #-------------------
#
#     #****@unittest.skipIf(TEST_ALL != True, 'skipping temporarily')
#     def test_within_class_recall_aggregation(self):
#         tally1 = self.tally_result(
#                               0, # Split number
#                               self.single_label_matching,
#                               self.single_pred,
#                               LearningPhase.TRAINING
#                               )
#         tally2 = self.tally_result(
#                               0, # Split number
#                               self.single_label_non_match,
#                               self.single_pred,
#                               LearningPhase.TRAINING
#                               )
#         # Because only one class represented,
#         # the others will be nan:
#         within_class_recalls1 = tally1.within_class_recalls()
#         within_class_recalls2 = tally2.within_class_recalls()
#
#         agg_within_class_recall = (within_class_recalls1 + within_class_recalls2) / 2.0
#         for idx in range(len(agg_within_class_recall)):
#             if idx in [0,1,3]:
#                 self.assertTrue(torch.isnan(agg_within_class_recall[idx]))
#             else:
#                 self.assertEqual(agg_within_class_recall[idx], 0.5)
#
#         # Larger batch:
#
#         tally1 = self.tally_result(
#                             0, # Split number
#                             self.ten_labels_perfect,
#                             self.ten_results,
#                             LearningPhase.TRAINING
#                             )
#
#         tally2 = self.tally_result(
#                             0, # Split number
#                             self.ten_labels_first_wrong,
#                             self.ten_results,
#                             LearningPhase.TRAINING
#                             )
#
#         recalls1 = tally1.within_class_recalls()
#         recalls2 = tally2.within_class_recalls()
#
#         mean_within_class_recall = self.tally_collection.mean_within_class_recall()
#         print('foo')

# ---------------- Utils ------------

#------------------------------------
# tally_result
#-------------------

    def tally_result(self, labels_tns, pred_prob_tns, learning_phase, epoch=1):
        '''
        Copy of BirdTrainer's tally_result for
        testing the tallying facility:
        '''
        # Predictions are for one batch. Example for
        # batch_size 2 and 4 target classes:
        #
        #    torch.tensor([[1.0, -2.0,  3.4,  4.2],
        #                  [4.1,  3.0, -2.3, -1.8]
        #                  ])
        # get:
        #     torch.return_types.max(
        #     values=tensor([4.2, 4.1]),
        #     indices=tensor([3, 0]))
        #
        # The indices are the class predictions:

        max_logits_rowise = torch.max(pred_prob_tns, dim=1)
        pred_class_ids = max_logits_rowise.indices
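        # E.g., for self.batch_pred from setUp
        # ([[0.1, 0.1, 0.5, 0.2], [0.6, 0.3, 0.5, 0.1]]),
        # pred_class_ids comes out tensor([2, 0]) -- the
        # targets in self.batch_label_matching.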

        # Use a fixed stand-in loss value:
        loss = torch.tensor(0.14)
        batch_size = 2
        tally = ResultTally(epoch, learning_phase, pred_prob_tns, labels_tns,
                            loss, self.num_classes, batch_size)

        self.tally_collection.add(tally)
        return tally
Example #10
    def __init__(self, config_info, debugging=False):
        '''
        Constructor
        '''

        self.log = LoggingService()
        if debugging:
            self.log.logging_level = DEBUG

        self.curr_dir = os.path.dirname(os.path.abspath(__file__))

        try:
            self.config = self.initialize_config_struct(config_info)
        except Exception as e:
            msg = f"During config init: {repr(e)}"
            self.log.err(msg)
            raise RuntimeError(msg) from e

        try:
            self.root_train_test_data = self.config.getpath(
                'Paths', 'root_train_test_data', relative_to=self.curr_dir)
        except ValueError as e:
            raise ValueError(
                "Config file must contain an entry 'root_train_test_data' in section 'Paths'"
            ) from e

        self.batch_size = self.config.getint('Training', 'batch_size')
        self.kernel_size = self.config.getint('Training', 'kernel_size')
        self.min_epochs = self.config.Training.getint('min_epochs')
        self.max_epochs = self.config.Training.getint('max_epochs')
        self.lr = self.config.Training.getfloat('lr')
        self.net_name = self.config.Training.net_name
        self.pretrained = self.config.Training.getboolean('pretrained', False)
        self.freeze = self.config.Training.getint('freeze', 0)
        self.to_grayscale = self.config.Training.getboolean(
            'to_grayscale', True)

        self.set_seed(42)

        self.log.info("Parameter summary:")
        self.log.info(f"network     {self.net_name}")
        self.log.info(f"pretrained  {self.pretrained}")
        if self.pretrained:
            self.log.info(f"freeze      {self.freeze}")
        self.log.info(f"min epochs  {self.min_epochs}")
        self.log.info(f"max epochs  {self.max_epochs}")
        self.log.info(f"batch_size  {self.batch_size}")

        self.fastest_device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        self.num_classes = self.find_num_classes(self.root_train_test_data)

        self.model = NetUtils.get_net(self.net_name,
                                      num_classes=self.num_classes,
                                      pretrained=self.pretrained,
                                      freeze=self.freeze,
                                      to_grayscale=self.to_grayscale)
        self.log.debug(
            f"Before any gpu push: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
        )

        FileUtils.to_device(self.model, 'gpu')

        self.log.debug(
            f"Before after model push: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
        )

        # No cross validation:
        self.folds = 0
        self.opt_name = self.config.Training.get('optimizer',
                                                 'Adam')  # Default
        self.optimizer = self.get_optimizer(self.opt_name, self.model, self.lr)

        self.loss_fn = nn.CrossEntropyLoss()
        self.scheduler = optim.lr_scheduler.CosineAnnealingLR(
            self.optimizer, self.min_epochs)

        sample_width = self.config.getint('Training', 'sample_width', 400)
        sample_height = self.config.getint('Training', 'sample_height', 400)
        self.train_loader, self.val_loader = self.get_dataloader(
            sample_width, sample_height)
        self.class_names = self.train_loader.dataset.classes

        log_dir = os.path.join(self.curr_dir, 'runs')
        raw_data_dir = os.path.join(self.curr_dir, 'runs_raw_results')

        self.setup_tensorboard(log_dir, raw_data_dir=raw_data_dir)

        # Log a few example spectrograms to tensorboard;
        # one per class:
        TensorBoardPlotter.write_img_grid(
            self.writer,
            self.root_train_test_data,
            len(self.class_names),  # Number of examples: one per class
        )

        # All ResultTally instances are
        # collected here: two per epoch, one
        # aggregating the training loop runs and
        # one the validation loop runs:

        self.step_results = ResultCollection()

        self.log.debug(
            f"Just before train: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
        )
        try:
            final_epoch = self.train()
            self.visualize_final_epoch_results(final_epoch)
        finally:
            self.close_tensorboard()
Example #11
    def run_inference(self, gpu_to_use=0):
        '''
        Runs model over dataloader. Along
        the way: creates ResultTally for each
        batch, and maintains dict instance variable
        self.raw_results for later conversion of
        logits to class IDs under different threshold
        assumptions. 
        
        self.raw_results: 
                {'all_outputs' : <arr>,
                 'all_labels'  : <arr>
                 }
        
        Returns a ResultCollection with the
        ResultTally instances of each batch.

        :param gpu_to_use: which GPU to deploy to (if it is available)
        :type gpu_to_use: int
        :return: collection of tallies, one for each batch,
            or None if something went wrong.
        :rtype: {None | ResultCollection}
        '''
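
        # Illustrative only (names as described in the docstring):
        # keeping the raw logits permits later re-thresholding
        # without re-running the model, e.g. along the lines of
        #
        #   probs = torch.softmax(raw_results['all_outputs'], dim=-1)
        #   ids_at_t = (probs > t).nonzero(as_tuple=False)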
        # Just in case the loop never runs:
        batch_num = -1
        overall_start_time = datetime.datetime.now()

        try:
            try:
                if torch.cuda.is_available():
                    self.model.load_state_dict(torch.load(self.model_path))
                    FileUtils.to_device(self.model, 'gpu', gpu_to_use)
                else:
                    self.model.load_state_dict(
                        torch.load(self.model_path,
                                   map_location=torch.device('cpu')))
            except RuntimeError as e:
                emsg = repr(e)
                if emsg.find("size mismatch for conv1") > -1:
                    emsg += " Maybe model was trained with to_grayscale=False, but local net created for grayscale?"
                    raise RuntimeError(emsg) from e

            loss_fn = nn.CrossEntropyLoss()

            result_coll = ResultCollection()

            # Save all per-class logits for ability
            # later to use different thresholds for
            # conversion to class IDs:

            all_outputs = []
            all_labels = []

            self.model.eval()
            num_test_samples = len(self.loader.dataset)
            self.log.info(
                f"Begin inference ({num_test_samples} test samples)...")

            samples_processed = 0

            loop_start_time = overall_start_time
            with torch.no_grad():

                for batch_num, (batch, targets) in enumerate(self.loader):
                    if torch.cuda.is_available():
                        images = FileUtils.to_device(batch, 'gpu')
                        labels = FileUtils.to_device(targets, 'gpu')
                    else:
                        images = batch
                        labels = targets

                    outputs = self.model(images)
                    loss = loss_fn(outputs, labels)

                    images = FileUtils.to_device(images, 'cpu')
                    outputs = FileUtils.to_device(outputs, 'cpu')
                    labels = FileUtils.to_device(labels, 'cpu')
                    loss = FileUtils.to_device(loss, 'cpu')

                    #**********
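                    # Debug-only instrumentation (between the
                    # #***** markers): relies on a locally patched
                    # dataloader exposing sample_id_seq, and on
                    # batch_size == 1 (the squeeze()/item() calls
                    # below assume a single-row batch):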
                    max_logit = outputs[0].max().item()
                    max_idx = (outputs.squeeze() == max_logit).nonzero(
                        as_tuple=False).item()
                    smpl_id = torch.utils.data.dataloader.sample_id_seq[-1]
                    lbl = labels[0].item()
                    pred_cl = max_idx

                    self.curr_dict[smpl_id] = (smpl_id, lbl, pred_cl)
                    #**********

                    # Specify the batch_num in place
                    # of an epoch, which is not applicable
                    # during testing:
                    tally = ResultTally(batch_num, LearningPhase.TESTING,
                                        outputs, labels, loss,
                                        self.num_classes, self.batch_size)
                    result_coll.add(tally, step=None)

                    all_outputs.append(outputs)
                    all_labels.append(labels)

                    samples_processed += len(labels)

                    del images
                    del outputs
                    del labels
                    del loss

                    torch.cuda.empty_cache()

                    time_now = datetime.datetime.now()
                    # Sign of life every 5 seconds:
                    if (time_now - loop_start_time).seconds >= 5:
                        self.log.info(
                            f"GPU{gpu_to_use} processed {samples_processed}/{num_test_samples} samples"
                        )
                        loop_start_time = time_now
        finally:

            #*********
            print(f"Sample seq: {torch.utils.data.dataloader.sample_id_seq}")
            torch.utils.data.dataloader.sample_id_seq = []
            #*********
            time_now = datetime.datetime.now()
            test_time_duration = time_now - overall_start_time
            # A human-readable duration string, down to minutes:
            duration_str = FileUtils.time_delta_str(test_time_duration,
                                                    granularity=4)
            self.log.info(
                f"Done with inference: {samples_processed} test samples; {duration_str}"
            )
            # Total number of batches we ran:
            num_batches = 1 + batch_num  # b/c of zero-base

            # If loader delivered nothing, the loop
            # never ran; warn, and get out:
            if num_batches == 0:
                self.log.warn(
                    f"Dataloader delivered no data from {self.samples_path}")
                self.close()
                return None

            # Var all_outputs is now a list of per-batch
            # tensors, each of shape (batch_size, num_classes):
            #  [tensor([[logit_cl0, ..., logit_cl<num_classes - 1>],   # sample0
            #           [logit_cl0, ..., logit_cl<num_classes - 1>],   # sample1
            #           ...]),
            #   ...
            #  ]
            # Stack into one tensor of shape
            # (num_batches, batch_size, num_classes),
            # unless an exception was raised at some point,
            # throwing us into this finally clause:
            if len(all_outputs) == 0:
                self.log.info(
                    "No outputs were produced; thus no results to report")
                return None

            self.all_outputs_tn = torch.stack(all_outputs)
            # Be afraid...be very afraid:
            assert(self.all_outputs_tn.shape == \
                   torch.Size([num_batches,
                               self.batch_size,
                               self.num_classes])
                   )

            # Var all_labels is now a list of num_batches
            # tensors, each containing batch_size labels:
            assert (len(all_labels) == num_batches)

            # Stack the per-batch label tensors
            # into one tensor:
            self.all_labels_tn = torch.stack(all_labels)
            assert(self.all_labels_tn.shape == \
                   torch.Size([num_batches, self.batch_size])
                   )
            # And equivalently:
            assert(self.all_labels_tn.shape == \
                   (self.all_outputs_tn.shape[0],
                    self.all_outputs_tn.shape[1]
                    )
                   )

            self.report_results(result_coll)
            self.close()

        return result_coll
Example #12
    @classmethod
    def load_preds_and_labels(cls, csv_path):
        '''
        Returns a ResultCollection. Each 
        ResultTally in the collection holds
        the outcomes of one epoch:
        
                created_at
                phase
                epoch
                num_classes
                batch_size
                preds
                labels
                mean_loss
                losses
        
        :param csv_path: path to CSV file with info
            from a past run
        :type csv_path: str
        '''
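
        # Assumed CSV layout, reconstructed from the loop below
        # (illustrative, not from the source): a header row, then
        # one row per epoch holding Python list literals:
        #
        #   epoch,train_preds,train_labels,val_preds,val_labels
        #   0,"[1, 0, 2]","[1, 0, 1]","[2, 2, 0]","[2, 1, 0]"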

        # Deferred import to avoid import circularity
        # with modules that use both the result_tallying
        # and utilities modules:

        from birdsong.result_tallying import ResultCollection, ResultTally

        coll = ResultCollection()
        csv_fname = os.path.basename(csv_path)

        if not os.path.exists(csv_path):
            raise ValueError(f"Path to csv file {csv_path} does not exist")

        # Get info encoded in the filename:
        prop_dict = cls.parse_filename(csv_fname)

        num_classes = prop_dict['num_classes']
        batch_size = prop_dict['batch_size']

        # Remove the above entries from
        # prop_dict, so we can pass the
        # rest into ResultTally as kwargs
        # with info beyond what ResultTally
        # requires as args:

        del prop_dict['num_classes']
        del prop_dict['batch_size']

        with open(csv_path, 'r') as fd:
            reader = csv.reader(fd)
            # Eat the header line:
            next(reader)
            for (epoch, train_preds, train_labels, val_preds,
                 val_labels) in reader:

                # All elements are strings.
                # Turn them into natives. The
                # additional parms to eval() make
                # the eval safer by withholding
                # built-ins and any globals:

                train_preds_arr = eval(
                    train_preds,
                    {"__builtins__": None},  # No built-ins at all
                    {}  # No additional func
                )
                train_labels_arr = eval(
                    train_labels,
                    {"__builtins__": None},  # No built-ins at all
                    {}  # No additional func
                )
                val_preds_arr = eval(
                    val_preds,
                    {"__builtins__": None},  # No built-ins at all
                    {}  # No additional func
                )

                val_labels_arr = eval(
                    val_labels,
                    {"__builtins__": None},  # No built-ins at all
                    {}  # No additional func
                )
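
                # Note (added): withholding __builtins__ narrows,
                # but does not fully neutralize, eval()'s attack
                # surface; for plain list literals like these,
                # ast.literal_eval(train_preds) would be the
                # stricter standard-library alternative.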

                epoch = int(epoch)

                train_tally = ResultTally(
                    epoch,
                    LearningPhase.TRAINING,
                    torch.tensor(train_preds_arr),
                    torch.tensor(train_labels_arr),
                    0.0,  # Placeholder for loss
                    num_classes,
                    batch_size,
                    prop_dict  # Additional, optional info
                )  # from the file name

                val_tally = ResultTally(
                    epoch,
                    LearningPhase.VALIDATING,
                    torch.tensor(val_preds_arr),
                    torch.tensor(val_labels_arr),
                    0.0,  # Placeholder for loss
                    num_classes,
                    batch_size,
                    prop_dict  # Additional, optional info
                )  # from the file name

                coll.add(train_tally, epoch)
                coll.add(val_tally, epoch)

        return coll