Example #1
def main(output_folder, do_download, gdc_executable, manifest_filepath,
         source_slides_folder):
    logger = get_logger(filename_handler='code.data_processing.' +
                        os.path.basename(__file__) + '_' +
                        strftime("%Y-%m-%d_%H:%M:%S", gmtime()) + '.log')

    logger.info('Data_processing control parameters:')
    logger.info('  do_download: ' + str(do_download))
    logger.info('  gdc_executable: ' + str(gdc_executable))
    logger.info('  manifest_filepath: ' + str(manifest_filepath))
    logger.info('  source_slides_folder: ' + str(source_slides_folder))
    logger.info('  Meta-parameters:')
    logger.info('    desired_magnification %s' % str(desired_magnification))
    logger.info('    tile_width %s' % str(desired_tile_width))
    logger.info('    expected_tile_shape %s' % str(expected_tile_shape))
    logger.info('    background_threshold %s' % str(background_threshold))
    logger.info('    background_pixel_value %s' % str(background_pixel_value))

    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Download SVS files using manifest and GDC extraction tool
    if do_download:
        if not os.path.exists(
                os.path.join(output_folder, 'has_been_downloaded')):
            if manifest_filepath is None or gdc_executable is None:
                raise ValueError(
                    'Download from TCGA is enabled: both --gdc-executable and --manifest must be provided'
                )
            if not os.path.exists(gdc_executable):
                raise FileNotFoundError(
                    f'Specified GDC executable {gdc_executable} not found')
            if not os.path.exists(manifest_filepath):
                raise FileNotFoundError(
                    f'Specified manifest file {manifest_filepath} not found')

            crude_slides_output_folder = os.path.join(output_folder,
                                                      'downloaded_slides')
            if not os.path.exists(crude_slides_output_folder):
                os.makedirs(crude_slides_output_folder)

            logger.info(
                f'Downloading slides into {crude_slides_output_folder}...')
            slides_filepaths, md5sums, cases_ids = svs_factory.download_svs_files(
                gdc_executable, manifest_filepath, crude_slides_output_folder)

            # Write control file after download is finished
            with open(os.path.join(output_folder, 'has_been_downloaded'),
                      'w') as f:
                f.write('\n'.join(
                    ','.join(a)
                    for a in zip(slides_filepaths, md5sums, cases_ids)))
            logger.info('  done')
        else:
            logger.info('Slides already downloaded -> skipping')
            # Retrieve all downloaded SVS files in case previous step not performed
            with open(os.path.join(output_folder, 'has_been_downloaded'),
                      'r') as f:
                download_content = f.read().splitlines()
            download_content = list(
                map(lambda line: line.split(','), download_content))
            slides_filepaths, md5sums, cases_ids = list(
                map(list, zip(*download_content)))
    else:
        if source_slides_folder is None:
            raise ValueError(
                'No download from TCGA: --source-slides-folder must point to a '
                'folder containing the slides to process')
        elif not os.path.exists(source_slides_folder):
            raise FileNotFoundError(
                f'Input folder {source_slides_folder} with slides to be processed not found'
            )

        logger.info(
            f'Skipping TCGA download as requested; listing source slides from {source_slides_folder}'
        )
        slides_filepaths = svs_factory.list_slides_in_folder(
            source_slides_folder)
        # if no slide filepaths were retrieved, try the regime where each slide sits inside a sup-folder
        if len(slides_filepaths) == 0:
            slides_filepaths = svs_factory.list_slides_in_folder(
                source_slides_folder, with_supfolder=True)
        md5sums = ['no_md5sum'] * len(slides_filepaths)
        cases_ids = ['no_case_id'] * len(slides_filepaths)

    # Tile all slides into super-patches
    has_been_tiled_filename = 'has_been_tiled_mag%d' % desired_magnification
    if not os.path.exists(
            os.path.join(output_folder, has_been_tiled_filename)):
        logger.info('Tiling slides into super-patches...')
        logger.info('  found %d files to be processed' % len(slides_filepaths))

        output_tiles_folders = svs_factory.tile_slides(slides_filepaths,
                                                       desired_tile_width,
                                                       desired_overlap,
                                                       desired_magnification)

        assert None not in output_tiles_folders
        assert len(output_tiles_folders) == len(slides_filepaths)

        # Write control file after tiling is finished, with tile folders, slide names, md5sums and case IDs
        with open(os.path.join(output_folder, has_been_tiled_filename),
                  'w') as f:
            f.write('\n'.join(
                ','.join(a)
                for a in zip(output_tiles_folders,
                             list(map(os.path.basename, slides_filepaths)),
                             md5sums, cases_ids)))
    else:
        logger.info('Slides already tiled at magnification %d -> skipping' %
                    desired_magnification)

    # Independently of previous processing, extract labels of SVS files
    filtered_tiles_output_folder = os.path.join(output_folder,
                                                'filtered_tiles')
    has_been_filtered_filename = os.path.join(output_folder,
                                              'has_been_moved_and_filtered')
    if not os.path.exists(has_been_filtered_filename):
        logger.info('Extracting labels...')
        # Retrieve directories in which the super-patches are located in case previous step not performed
        with open(os.path.join(output_folder, has_been_tiled_filename),
                  'r') as f:
            tiled_content = f.read().splitlines()
        tiled_content = list(map(lambda line: line.split(','), tiled_content))
        output_tiles_folders, svs_filenames, md5sums, cases_ids = list(
            map(list, zip(*tiled_content)))

        associated_labels = list(
            map(case_factory.infer_class_from_tcga_name, svs_filenames))
        logger.info('  done')

        logger.info('Moving+background-filtering tiles into %s' %
                    output_folder)
        data_folders = svs_factory.move_and_filter_tiles_folders(
            output_tiles_folders,
            associated_labels,
            svs_filenames,
            cases_ids,
            filtered_tiles_output_folder,
            background_pixel_value,
            background_threshold,
            expected_tile_shape,
            logger=logger)
        logger.info('  done')

        with open(has_been_filtered_filename, 'w') as f:
            f.write('\n'.join(data_folders))
        logger.info('Wrote `has_been_moved_and_filtered` file')
    else:
        logger.info('Tiles already moved and filtered -> skipping')
        # seek classes folders
        # data_folders = [f for f in os.listdir(filtered_tiles_output_folder)
        #                 if not os.path.isfile(os.path.join(filtered_tiles_output_folder, f))]
        with open(has_been_filtered_filename, 'r') as f:
            data_folders = f.read().splitlines()
        logger.info('Found %d source slide folders in %s' %
                    (len(data_folders), filtered_tiles_output_folder))

        # logger.info('Performing train/val/test splitting with background removal')
        # train_cases_ids, val_cases_ids, test_cases_ids = case_factory.split_svs_samples_casewise(output_tiles_folders,
        #                                                                                          cases_ids,
        #                                                                                          train_size,
        #                                                                                          val_size,
        #                                                                                          test_size)

    logger.info('Pre-processing done')
    return list(map(lambda f: os.path.join(output_folder, f), data_folders))
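A minimal sketch of how this entry point might be invoked from the command line. The flag names mirror the error messages inside main (--gdc-executable, --manifest, --source-slides-folder); the argparse wrapper itself is hypothetical and not part of the original module.

import argparse

if __name__ == '__main__':
    # Hypothetical CLI wiring; flag names follow the error messages in main().
    parser = argparse.ArgumentParser(description='TCGA slide pre-processing')
    parser.add_argument('--output-folder', required=True)
    parser.add_argument('--do-download', action='store_true')
    parser.add_argument('--gdc-executable', default=None)
    parser.add_argument('--manifest', dest='manifest_filepath', default=None)
    parser.add_argument('--source-slides-folder', default=None)
    args = parser.parse_args()
    main(args.output_folder, args.do_download, args.gdc_executable,
         args.manifest_filepath, args.source_slides_folder)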
Example #2
def main(hyper_parameters):
    prefix_time = strftime("%Y-%m-%d %H:%M:%S", gmtime())
    logger = get_logger(filename_handler='code.' + __file__ + '_' +
                        prefix_time + '.log',
                        verbose=hyper_parameters['verbose'])
    logger.info('Hyper parameters')
    logger.info(pprint.pformat(hyper_parameters, indent=4))

    # Pre-processing should have been done beforehand, retrieve data by specifying data preprocessing output folder
    slides_folders = end_to_end_data_preprocessing(
        hyper_parameters['input_data_folder'], hyper_parameters['do_download'],
        hyper_parameters['gdc_executable'], hyper_parameters['manifest_file'],
        hyper_parameters['source_slides_folder'])

    logger.info('Initializing model... ')
    if not os.path.exists(hyper_parameters['models_save_folder']):
        os.makedirs(hyper_parameters['models_save_folder'])
    # Instantiate instance classifier model, then MIL wrapper
    instance_classifier, input_width = instantiate_model(
        model_type=hyper_parameters['underlying_model_type'],
        pretrained=hyper_parameters['underlying_model_pretrained'],
        n_classes=1)
    logger.info(
        f'  {hyper_parameters["underlying_model_type"]} architecture initialized'
    )
    mil_model = MaxMinMIL(instance_classifier,
                          alpha=hyper_parameters['alpha'],
                          beta=hyper_parameters['beta'],
                          cuda=hyper_parameters['cuda'])
    logger.info('  mil wrapper initialized')

    if hyper_parameters['underlying_model_load_from'] is not None:
        logger.warning('  Initializing model from %s' %
                       hyper_parameters['underlying_model_load_from'])
        mil_model.load_state_dict(
            torch.load(hyper_parameters['underlying_model_load_from']))

    if hyper_parameters['cuda']:
        n_devices = torch.cuda.device_count()
        if n_devices > 1:
            mil_model = nn.DataParallel(mil_model)
        mil_model.cuda()
    logger.info('done')

    optimizer = optim.Adam(mil_model.parameters(),
                           lr=hyper_parameters['learning_rate'],
                           weight_decay=hyper_parameters['weight_decay'])

    # Load data and split case-wise into train, val and test sets
    logger.info('Pre-loading all data...')
    train_dataset, val_dataset, test_dataset = build_datasets(
        source_slides_folders=slides_folders,
        model_input_width=input_width,
        hyper_parameters=hyper_parameters,
        logger=logger)
    logger.info('Train size %d' % len(train_dataset))
    logger.info('Val size %d' % len(val_dataset))
    logger.info('Test size %d' % len(test_dataset))

    train_dataloader = to_dataloader(train_dataset, True)
    val_dataloader = to_dataloader(val_dataset,
                                   False) if len(val_dataset) else None
    test_dataloader = to_dataloader(test_dataset,
                                    False) if len(test_dataset) else None

    # Instantiate summary writer if tensorboard activated
    if hyper_parameters['with_tensorboard']:
        summary_writer_filename = 'summary_' + prefix_time
        summary_writer_folder_path = os.path.join('tensorboard',
                                                  summary_writer_filename)
        summary_writer = SummaryWriter(log_dir=summary_writer_folder_path)
    else:
        summary_writer = None

    val_losses = []
    # Perform training loop: for each epoch, train and validate
    logger.info('Starting training...')
    start_training_time = time.time()
    for epoch in range(hyper_parameters['n_epochs']):
        # Train
        train_loss, train_savepath = perform_epoch(
            mil_model,
            optimizer,
            epoch,
            train_dataloader,
            hyper_parameters=hyper_parameters,
            is_training=True,
            logger=logger,
            set_name='training',
            prefix_time=prefix_time,
            summary_writer=summary_writer)

        # Validate
        if val_dataloader:
            with torch.no_grad():
                val_loss, _ = perform_epoch(mil_model,
                                            optimizer,
                                            epoch,
                                            val_dataloader,
                                            hyper_parameters=hyper_parameters,
                                            is_training=False,
                                            logger=logger,
                                            set_name='validation',
                                            prefix_time=prefix_time,
                                            summary_writer=summary_writer)

            # Early stopping
            val_losses.append(val_loss)
            do_stop, best_value = early_stopping(
                val_losses,
                patience=hyper_parameters['early_stopping_patience'])
            if do_stop:
                logger.warning(
                    'Early stopping triggered: stopping training after no improvement on val set for '
                    '%d epochs with value %.3f' %
                    (hyper_parameters['early_stopping_patience'], best_value))
                break

    logger.warning('Total training time %s' %
                   (time.time() - start_training_time))

    # Test
    if test_dataloader:
        logger.info('Starting testing...')
        with torch.no_grad():
            perform_epoch(mil_model,
                          optimizer,
                          -1,
                          test_dataloader,
                          hyper_parameters=hyper_parameters,
                          is_training=False,
                          logger=logger,
                          set_name='test',
                          prefix_time=prefix_time,
                          summary_writer=summary_writer)

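The loop above depends on an early_stopping(val_losses, patience=...) helper that returns a (do_stop, best_value) pair. A minimal sketch consistent with that call site, assuming a lower validation loss is better; the project's actual helper may differ.

def early_stopping(val_losses, patience):
    # Stop when the best (lowest) validation loss has not improved for
    # `patience` consecutive epochs; also report that best value.
    best_value = min(val_losses)
    epochs_since_best = len(val_losses) - 1 - val_losses.index(best_value)
    return epochs_since_best >= patience, best_value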
Example #3
def main(source_folder, output_folder, gdc_executable_path):
    logger = get_logger(filename_handler='data_processing.log', verbose=True)
    logger.info('Source folder %s' %
                (os.path.abspath(source_folder) if source_folder else 'None'))
    logger.info('Output folder %s' % os.path.abspath(output_folder))
    logger.info('Meta-parameters:')
    logger.info('  desired_magnification %s' % str(desired_magnification))
    logger.info('  tile_width %s' % str(desired_tile_width))
    logger.info('  expected_tile_shape %s' % str(expected_tile_shape))
    logger.info('  background_threshold %s' % str(background_threshold))
    logger.info('  background_pixel_value %s' % str(background_pixel_value))

    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Download SVS files using manifest and GDC extraction tool
    if not os.path.exists(os.path.join(output_folder, 'has_been_downloaded')):
        logger.info('Downloading slides...')
        svs_filepaths, md5sums, cases_ids = svs_factory.download_svs_files(
            source_folder, gdc_executable_path)

        # Write control file after download is finished
        with open(os.path.join(output_folder, 'has_been_downloaded'),
                  'w') as f:
            f.write('\n'.join(','.join(a)
                              for a in zip(svs_filepaths, md5sums, cases_ids)))
        logger.info('  done')
    else:
        logger.info('Slides already downloaded -> skipping')

    # Tile all slides into super-patches
    has_been_tiled_filename = 'has_been_tiled_mag%d' % desired_magnification
    if not os.path.exists(
            os.path.join(output_folder, has_been_tiled_filename)):
        logger.info('Tiling slides into super-patches...')
        # Retrieve all downloaded SVS files in case previous step not performed
        with open(os.path.join(output_folder, 'has_been_downloaded'),
                  'r') as f:
            download_content = f.read().splitlines()
        download_content = list(
            map(lambda line: line.split(','), download_content))
        svs_filepaths, md5sums, cases_ids = list(
            map(list, zip(*download_content)))
        logger.info('  found %d files to be processed' % len(svs_filepaths))

        output_tiles_folders = svs_factory.tile_slides(svs_filepaths,
                                                       desired_tile_width,
                                                       desired_overlap,
                                                       desired_magnification)

        assert None not in output_tiles_folders
        assert len(output_tiles_folders) == len(svs_filepaths)

        # Write control file after tiling is finished, with tile folders, slide names, md5sums and case IDs
        with open(os.path.join(output_folder, has_been_tiled_filename),
                  'w') as f:
            f.write('\n'.join(','.join(a) for a in zip(
                output_tiles_folders, list(map(
                    os.path.basename, svs_filepaths)), md5sums, cases_ids)))
    else:
        logger.info('Slides already tiled at magnification %d -> skipping' %
                    desired_magnification)

    # Independently of previous processing, extract labels of SVS files
    if not os.path.exists(
            os.path.join(output_folder, 'has_been_moved_and_filtered')):
        logger.info('Extracting labels...')
        # Retrieve directories in which the super-patches are located in case previous step not performed
        with open(os.path.join(output_folder, has_been_tiled_filename),
                  'r') as f:
            tiled_content = f.read().splitlines()
        tiled_content = list(map(lambda line: line.split(','), tiled_content))
        output_tiles_folders, svs_filenames, md5sums, cases_ids = list(
            map(list, zip(*tiled_content)))

        associated_labels = list(
            map(case_factory.infer_class_from_tcga_name, svs_filenames))
        logger.info('  done')

        logger.info('Moving+background-filtering tiles into %s' %
                    output_folder)
        data_folders = svs_factory.move_and_filter_tiles_folders(
            output_tiles_folders,
            associated_labels,
            svs_filenames,
            cases_ids,
            output_folder,
            background_pixel_value,
            background_threshold,
            expected_tile_shape,
            logger=logger)
        logger.info('  done')

        with open(os.path.join(output_folder, 'has_been_moved_and_filtered'),
                  'w') as f:
            f.write('\n'.join(data_folders))
        logger.info('Wrote `has_been_moved_and_filtered` file')
    else:
        logger.info('Tiles already moved and filtered -> skipping')
        # seek classes folders
        data_folders = [
            f for f in os.listdir(output_folder)
            if not os.path.isfile(os.path.join(output_folder, f))
        ]
        logger.info('Found %d source slide folders' % len(data_folders))

        # logger.info('Performing train/val/test splitting with background removal')
        # train_cases_ids, val_cases_ids, test_cases_ids = case_factory.split_svs_samples_casewise(output_tiles_folders,
        #                                                                                          cases_ids,
        #                                                                                          train_size,
        #                                                                                          val_size,
        #                                                                                          test_size)

    return list(map(lambda f: os.path.join(output_folder, f), data_folders))
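All three examples checkpoint their progress through plain-text control files: each step writes one comma-joined record per slide, and a later run rebuilds the parallel lists with zip(*...). A self-contained sketch of that round-trip, with illustrative paths and values; note the format silently breaks if any field contains a comma.

# Write one comma-separated record per slide (values are illustrative).
records = [('/tmp/a.svs', 'md5_a', 'case_a'), ('/tmp/b.svs', 'md5_b', 'case_b')]
with open('has_been_downloaded', 'w') as f:
    f.write('\n'.join(','.join(fields) for fields in records))

# Read the control file back and rebuild the parallel lists.
with open('has_been_downloaded', 'r') as f:
    rows = [line.split(',') for line in f.read().splitlines()]
filepaths, md5sums, cases_ids = map(list, zip(*rows))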