Example #1
    def __init__(self, logdir, **kwargs):
        """
        Get the SummaryWriter singleton instance, or create a new one if none exists yet.

        This class writes summaries to TensorBoard and MLFlow. It is designed as a singleton,
        so that every part of the program uses the same instance. After the first construction,
        the constructor always returns the same instance, regardless of the parameters passed.

        The summary writer will not write to disk when debugging is detected to avoid cluttering
        the results directory.

        :param logdir: name of log directory
        :param kwargs: kwargs for underlying TensorBoardX summary writer.
        """
        if not os.path.exists(LOG_ROOT):
            os.makedirs(LOG_ROOT)

        dirs = sorted(os.listdir(LOG_ROOT), reverse=True)
        num = 0
        for d in dirs:
            if logdir in d:
                num = int(d[:3]) + 1
                break

        logdir = str(num).zfill(3) + '_' + logdir
        logdir = os.path.join(LOG_ROOT, logdir)

        if utils.is_debugging():
            warnings.warn(
                'Debugging mode: will write to temporary TensorBoard file.',
                UserWarning)
            logdir = utils.build_tmp_dir()

        super(SummaryWriter, self).__init__(logdir, flush_secs=60, **kwargs)
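Every example on this page gates disk writes behind utils.is_debugging(). That helper is not shown here; below is a minimal sketch of how such a check is often implemented (the sys.gettrace() heuristic is an assumption, not this repository's actual code):

import sys

def is_debugging():
    # Best-effort debugger detection: pdb, PyCharm, and VS Code all
    # install a trace function, so a non-None result suggests an
    # active debugging session.
    return sys.gettrace() is not None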
Example #2
    def __init__(self,
                 train_data,
                 eval_data,
                 batch_size,
                 shuffle=True,
                 dvc_file=None):
        """
        Construct a new dataset.

        This object bundles the training and evaluation data of a dataset. It can
        return a dataloader for each of them with the preset batch size and
        shuffle setting. The DVC file of the dataset on disk will be logged to MLFlow as
        an artifact.

        :param train_data: torch.Dataset of the training data
        :param eval_data: torch.Dataset of the evaluation data
        :param batch_size: default batch size
        :param shuffle: default shuffle
        :param dvc_file: dvc file path of data
        """
        self.train_data = train_data
        self.eval_data = eval_data

        self.batch_size = batch_size
        self.shuffle = shuffle

        self.dvc_file = dvc_file
        if self.dvc_file is not None and not utils.is_debugging():
            dvc_file = dvc_file if isinstance(dvc_file,
                                              tuple) else (dvc_file, )
            for f in dvc_file:
                mlflow.log_artifact(f, artifact_path='data_version')
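For illustration, a hedged sketch of how this wrapper could hand out loaders with its stored defaults (the loaders helper below is hypothetical, not part of the class shown):

from torch.utils.data import DataLoader

def loaders(dataset, num_workers=0):
    # Build a (possibly shuffled) training loader and a deterministic
    # eval loader from the defaults stored on the wrapper.
    train_loader = DataLoader(dataset.train_data,
                              batch_size=dataset.batch_size,
                              shuffle=dataset.shuffle,
                              num_workers=num_workers)
    eval_loader = DataLoader(dataset.eval_data,
                             batch_size=dataset.batch_size,
                             shuffle=False,
                             num_workers=num_workers)
    return train_loader, eval_loader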
Example #3
    def __init__(self, file_list_path, train_data=True, has_label=True,
                 transform=None, split=0.8):
        df_train = pd.read_csv(file_list_path)
        df_value = df_train.values
        if has_label:
            split_index = int(df_value.shape[0] * split)
            if train_data:
                split_data = df_value[:split_index]
            else:
                split_data = df_value[split_index:]
            if utils.is_debugging():
                split_data = df_value[:64]
            file_names = [None] * split_data.shape[0]
            labels = [None] * split_data.shape[0]

            for index, line in enumerate(split_data):
                f, invasive = line
                file_names[index] = os.path.join(settings.TRAIN_DIR,
                                                 str(f) + '.jpg')
                labels[index] = invasive

            self.labels = np.array(labels, dtype=np.float32)
        else:
            file_names = [None] * df_train.values.shape[0]
            for index, line in enumerate(df_train.values):
                f, invasive = line
                file_names[index] = os.path.join(settings.TEST_DIR,
                                                 str(int(f)) + '.jpg')
        if utils.is_debugging():
            file_names = file_names[:64]
        self.transform = transform
        self.num = len(file_names)
        self.file_names = file_names
        self.train_data = train_data
        self.has_label = has_label

        self.images = []

        print("pre-reading images from files.")
        for file_name in tqdm.tqdm(file_names):
            self.images.append(pil_load(file_name))

        print("load %d images." % len(self.images))
Example #4
    def start_dashboard(self):
        if utils.is_debugging():
            # TODO: Deal with plot UI not being in the main thread somehow - (move to browser?)
            log.warning('Dashboard not supported in debug mode')
            return
        q = Queue(maxsize=10)
        p = Process(target=dashboard_fn, args=(q, ))
        p.start()
        self.dashboard_process = p
        self.dashboard_queue = q
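The dashboard_fn target is likewise not shown; a minimal sketch of a queue-consuming worker, assuming the trainer pushes metric dicts and a None sentinel on shutdown:

def dashboard_fn(q):
    # Drain the queue until the sentinel arrives; real code would
    # update a plot here instead of printing.
    while True:
        item = q.get()
        if item is None:
            break
        print('dashboard update:', item)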
Example #5
    def close(self):
        """Close the event file and add it with the model files to MLFlow."""
        super().close()

        if not utils.is_debugging():
            files = os.listdir(self.log_dir)
            event_file = [f for f in files if f.startswith('events')][0]
            model_files = [f for f in files if f.endswith('.pth')]

            mlflow.log_artifact(os.path.join(self.log_dir, event_file),
                                artifact_path='events')
            for m in model_files:
                mlflow.log_artifact(os.path.join(self.log_dir, m),
                                    artifact_path='models')
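Typical call site, assuming the singleton writer from Example #1 (the log directory name is illustrative):

writer = SummaryWriter('my_experiment')
# ... training loop adds summaries ...
writer.close()  # flush the event file and log it plus model files to MLFlow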
Example #6
    def write_results(self, results, time_step, scalar_tab=''):
        """
        Write dictionary of results to TensorBoard and MLFlow.

        This convenience function takes a dictionary that specifies
        several different summaries to be written. The summaries will
        be passed to the right add_* function of the summary writer.
        Scalars will be logged to MLFlow, too.

        example: {'scalars': {'metric1': 0.1, 'metric2': 0.5}, 'images': {'img1': img_tensor}}

        :param results: dictionary of summaries
        :param time_step: time step to log for
        :param scalar_tab: prefix that selects the right tab in the scalar overview in TensorBoard
        """
        if not scalar_tab.endswith('/'):
            scalar_tab += '/'

        if 'scalars' in results:
            for tag, scalar in results['scalars'].items():
                self.add_scalar(tag=scalar_tab + tag,
                                scalar_value=scalar,
                                global_step=time_step)
                if not utils.is_debugging():
                    mlflow_tag = (scalar_tab + tag).replace('/', '_')
                    mlflow.log_metric(mlflow_tag, scalar, time_step)

        if 'images' in results:
            for tag, image in results['images'].items():
                if image.dim() == 2:
                    formats = 'HW'
                elif image.dim() == 3:
                    formats = 'CHW' if image.shape[0] in [1, 3] else 'HWC'
                elif image.dim() == 4:
                    formats = 'NCHW' if image.shape[1] in [1, 3] else 'NHWC'
                else:
                    raise ValueError('Unknown image format with shape %s' %
                                     str(image.shape))

                self.add_images(tag, image, time_step, dataformats=formats)

        if 'series' in results:
            for tag, series in results['series'].items():
                plot = self._plot_series(tag, series)
                self.add_figure(tag, plot, time_step, close=True)

        if 'embeddings' in results:
            for tag, embedding in results['embeddings'].items():
                self.add_embedding(**embedding, tag=tag, global_step=time_step)
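A small usage sketch of the results dictionary from the docstring (the tensor shape is illustrative; writer is the instance from Example #1):

import torch

results = {
    'scalars': {'loss': 0.42, 'accuracy': 0.91},
    'images': {'sample': torch.rand(3, 64, 64)},  # dim() == 3, CHW layout
}
writer.write_results(results, time_step=10, scalar_tab='eval')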
Example #7
    def save(self, obj, name, tag):
        """
        Save a model to file.

        The model is saved to the log directory as '<name>_<tag>.pth'.

        :param obj: model to save
        :param name: name of the save file
        :param tag: tag appended to the name as a suffix
        """
        if utils.is_debugging():
            warnings.warn(
                'Debugging mode: will save model checkpoint to temporary dir.',
                UserWarning)

        file_name = os.path.join(self.log_dir, name + '_' + tag + '.pth')
        torch.save(obj, file_name)
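Example call, assuming a model and the writer above; this writes '<log_dir>/encoder_best.pth' (the names are illustrative):

writer.save(model.state_dict(), name='encoder', tag='best')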
Example #8
    def add_data(df_values, dir_path, label_threshold=None):
        data_set = []

        count = 0
        for line in tqdm.tqdm(df_values):
            image_name, invasive = line
            image_path = os.path.join(dir_path, str(int(image_name)) + '.jpg')

            if label_threshold is not None:
                if invasive >= label_threshold:
                    label = 1.0
                else:
                    label = 0.0
            else:
                label = invasive
            image_data = ImageData(image_path, np.float32(label))
            image_data.image = pil_load(image_data.path)
            data_set.append(image_data)
            count += 1
            if utils.is_debugging() and count == 20:
                print("break image pre-reads for debugging purpose.")
                break

        return data_set
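The ImageData container used above is not shown on this page; a minimal sketch of the interface the loop relies on (a path, a label, and a writable image attribute):

class ImageData:
    # Lightweight record pairing an image path with its float label;
    # the PIL image is attached after pre-reading.
    def __init__(self, path, label):
        self.path = path
        self.label = label
        self.image = None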
Example #9
def run(config, device, epochs, replications, seed, num_data_workers):
    """
    Run an experiment of the given config.

    An MLFlow experiment will be set according to
    the name in the config. A BaseTask will be built
    and its train function called. Each call of the run function
    with the same config becomes a run of this experiment.
    If replications is set to a number greater than zero, a nested
    run is created and the task is executed that many times.

    When debugging, nothing is written to disk to avoid
    cluttering the results directory.

    :param config: path to the config JSON file or config dict
    :param device: device to train on
    :param epochs: epochs to train for
    :param replications: number of times to replicate this run
    :param seed: random seed to use
    :param num_data_workers: number of worker threads for data loading
    """
    # Set seed for randomization
    if seed is not None:
        # Make PyTorch and numpy deterministic
        torch.manual_seed(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        np.random.seed(seed)
        print('Fixed randomization. Seed %d' % seed)
        print('#' * 40)
    else:
        # Draw a random seed, as none was set
        seed = np.random.randint(np.iinfo(np.int32).max)
        torch.manual_seed(seed)
        np.random.seed(seed)

    # Load config JSON
    if isinstance(config, str):
        print('Run experiment from %s' % config)
        print('#' * 40)
        config = utils.read_config(config)
    elif isinstance(config, dict):
        print('Run experiment with dict named %s' % config['name'])
        print('#' * 40)
    else:
        raise ValueError(
            'Config has to be either a string path or a dict, but is %s.' %
            str(type(config)))

    # Extract config dicts for components
    name = config['name']
    dataset = config['dataset']
    model = config['model']
    trainer = config['trainer']
    metrics = config['metrics']

    # Setup mlflow experiment
    if utils.is_debugging():
        # Reroute mlflow to tmp file on debugging
        warnings.warn(
            'Debugging mode: MLFlow stuff will be saved to temporary dir.',
            UserWarning)
        mlflow.set_tracking_uri('file:' + utils.build_tmp_dir())
    else:
        script_path = os.path.dirname(__file__)
        root_path = os.path.dirname(script_path)
        mlflow.set_tracking_uri('file:' + root_path)
    mlflow.set_experiment(name)

    # Start the top level run
    nest_runs = replications > 0
    with mlflow.start_run(nested=nest_runs):
        # Log parameters to run
        utils.log_config(config)
        mlflow.log_param('max_epochs', epochs)
        mlflow.log_param('seed', seed)
        mlflow.set_tag('device', device)

        if nest_runs:
            # Open child runs for each replication
            mlflow.log_param('replications', replications)
            seeds = np.random.randint(np.iinfo(np.int32).max,
                                      size=replications)
            for i, s in enumerate(seeds):
                print('Run replication %d/%d...' % (i + 1, replications))
                with mlflow.start_run(nested=True):
                    # Log params to child runs
                    utils.log_config(config)
                    mlflow.set_tag('replication', i)

                    # Set derived seed for child runs to make each reproducible
                    mlflow.log_param('seed', s)
                    torch.manual_seed(s)
                    np.random.seed(s)

                    # Execute run
                    task = BaseTask(name, device, dataset, model, trainer,
                                    metrics)
                    task.train(epochs, num_data_workers)
        else:
            # Simply execute top level run, when replications are zero
            task = BaseTask(name, device, dataset, model, trainer, metrics)
            task.train(epochs, num_data_workers)
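Hypothetical invocation; the config path and values are illustrative, not taken from the repository:

run(config='configs/experiment.json',
    device='cuda:0',
    epochs=50,
    replications=0,
    seed=42,
    num_data_workers=4)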