def make_data_iterators(train_images, train_targets, test_images, test_targets,
                        crop, split, args):
    from topaz.utils.data.sampler import StratifiedCoordinateSampler
    from torch.utils.data.dataloader import DataLoader

    ## training parameters
    minibatch_size = args.minibatch_size
    epoch_size = args.epoch_size
    num_epochs = args.num_epochs
    num_workers = args.num_workers
    if num_workers < 0:  # set num workers to use all CPUs
        num_workers = mp.cpu_count()
    testing_batch_size = args.test_batch_size
    balance = args.minibatch_balance  # ratio of positive to negative in minibatch
    if args.natural:
        balance = None
    report('minibatch_size={}, epoch_size={}, num_epochs={}'.format(
        minibatch_size, epoch_size, num_epochs))

    ## create augmented training dataset
    train_dataset = make_traindataset(train_images, train_targets, crop)
    test_dataset = None
    if test_targets is not None:
        test_dataset = make_testdataset(test_images, test_targets)

    ## create minibatch iterators
    labels = train_dataset.data.labels
    sampler = StratifiedCoordinateSampler(labels,
                                          size=epoch_size * minibatch_size,
                                          balance=balance, split=split)
    train_iterator = DataLoader(train_dataset, batch_size=minibatch_size,
                                sampler=sampler, num_workers=num_workers)

    test_iterator = None
    if test_dataset is not None:
        test_iterator = DataLoader(test_dataset, batch_size=testing_batch_size,
                                   num_workers=0)

    return train_iterator, test_iterator

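# Illustrative usage sketch (not part of the original module): how
# make_data_iterators might be driven from a minimal argparse.Namespace.
# The attribute names mirror the fields read above; the values here are
# hypothetical, not the CLI's actual defaults.
def _example_make_data_iterators(train_images, train_targets, crop, split):
    from argparse import Namespace
    args = Namespace(minibatch_size=256, epoch_size=1000, num_epochs=10,
                     num_workers=-1,  # -1 -> use all CPUs (see above)
                     test_batch_size=1, minibatch_balance=0.0625,
                     natural=False)
    # no held-out set in this sketch, so the test iterator comes back as None
    return make_data_iterators(train_images, train_targets, None, None,
                               crop, split, args)
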
def report_data_stats(train_images, train_targets, test_images, test_targets):
    report('source\tsplit\tp_observed\tnum_positive_regions\ttotal_regions')
    num_positive_regions = 0
    total_regions = 0
    for i in range(len(train_images)):
        p = sum(train_targets[i][j].sum() for j in range(len(train_targets[i])))
        p = int(p)
        total = sum(train_targets[i][j].size for j in range(len(train_targets[i])))
        num_positive_regions += p
        total_regions += total
        p_observed = p / total
        p_observed = '{:.3g}'.format(p_observed)
        report(str(i) + '\t' + 'train' + '\t' + p_observed + '\t' + str(p) +
               '\t' + str(total))
        if test_targets is not None:
            p = sum(test_targets[i][j].sum() for j in range(len(test_targets[i])))
            p = int(p)
            total = sum(test_targets[i][j].size for j in range(len(test_targets[i])))
            p_observed = p / total
            p_observed = '{:.3g}'.format(p_observed)
            report(str(i) + '\t' + 'test' + '\t' + p_observed + '\t' + str(p) +
                   '\t' + str(total))
    return num_positive_regions, total_regions

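# For reference, a hypothetical example of the tab-separated table
# report_data_stats emits: one train row per source, plus a matching test
# row when test targets are provided. The numbers are made up.
#
#   source  split  p_observed  num_positive_regions  total_regions
#   0       train  0.000266    1500                  5636096
#   0       test   0.00027     380                   1404928
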
def make_model(args):
    from topaz.model.factory import get_feature_extractor
    import topaz.model.classifier as C

    report('Loading model:', args.model)
    if args.model.endswith('.sav'):  # loading pretrained model
        model = torch.load(args.model)
        model.train()
        return model

    report('Model parameters: units={}, dropout={}, bn={}'.format(
        args.units, args.dropout, args.bn))

    # initialize the model
    units = args.units
    dropout = args.dropout
    bn = args.bn == 'on'
    pooling = args.pooling
    unit_scaling = args.unit_scaling

    feature_extractor = get_feature_extractor(args.model, units,
                                              dropout=dropout, bn=bn,
                                              unit_scaling=unit_scaling,
                                              pooling=pooling)
    classifier = C.LinearClassifier(feature_extractor)

    ## if the method is generative, create the generative model as well
    generative = None
    if args.autoencoder > 0:
        from topaz.model.generative import ConvGenerator
        ngf = args.ngf
        depth = int(np.log2(classifier.width + 1) - 3)
        generative = ConvGenerator(classifier.latent_dim, units=ngf, depth=depth)
        ## attach the generative model
        classifier.generative = generative
        report('Generator: units={}, size={}'.format(ngf, generative.width))

    report('Receptive field:', classifier.width)

    return classifier

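# A quick worked example of the generator-depth formula above, assuming
# classifier.width is the model's (odd) receptive field: for width = 63,
# np.log2(63 + 1) = 6.0, so depth = int(6.0 - 3) = 3. A minimal sketch
# checking the arithmetic (the helper name is hypothetical):
def _example_generator_depth(width=63):
    import numpy as np
    return int(np.log2(width + 1) - 3)  # -> 3 for width=63
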
def main(args):
    ## initialize the model
    classifier = make_model(args)

    if args.describe:
        ## only print a description of the model and terminate
        print(classifier)
        sys.exit()

    ## set the device
    use_cuda = topaz.cuda.set_device(args.device)
    report('Using device={} with cuda={}'.format(args.device, use_cuda))
    if use_cuda:
        classifier.cuda()

    ## load the data
    radius = args.radius  # number of pixels around coordinates to label as positive
    train_images, train_targets, test_images, test_targets = \
        load_data(args.train_images, args.train_targets, args.test_images,
                  args.test_targets, radius, format_=args.format_,
                  k_fold=args.k_fold, fold=args.fold,
                  cross_validation_seed=args.cross_validation_seed,
                  image_ext=args.image_ext)
    num_positive_regions, total_regions = report_data_stats(
        train_images, train_targets, test_images, test_targets)

    ## make the training step method
    if args.num_particles > 0:
        expected_num_particles = args.num_particles
        # make this expected particles in training set rather than per micrograph
        num_micrographs = sum(len(images) for images in train_images)
        expected_num_particles *= num_micrographs

        # given the expected number of particles and the radius,
        # calculate what pi should be:
        # pi = pixels_per_particle*expected_number_of_particles/pixels_in_dataset
        grid = np.linspace(-radius, radius, 2 * radius + 1)
        xx = np.zeros((2 * radius + 1, 2 * radius + 1)) + grid[:, np.newaxis]
        yy = np.zeros((2 * radius + 1, 2 * radius + 1)) + grid[np.newaxis]
        d2 = xx**2 + yy**2
        mask = (d2 <= radius**2).astype(int)
        pixels_per_particle = mask.sum()

        # total_regions is the number of regions in the data
        pi = pixels_per_particle * expected_num_particles / total_regions
        report('Specified expected number of particles per micrograph = {}'.format(
            args.num_particles))
        report('With radius = {}'.format(radius))
        report('Setting pi = {}'.format(pi))
    else:
        pi = args.pi
        report('pi = {}'.format(pi))

    trainer, criteria, split = make_training_step_method(
        classifier, num_positive_regions,
        num_positive_regions / total_regions,
        lr=args.learning_rate, l2=args.l2, method=args.method,
        pi=pi, slack=args.slack, autoencoder=args.autoencoder)

    ## training parameters
    train_iterator, test_iterator = make_data_iterators(
        train_images, train_targets, test_images, test_targets,
        classifier.width, split, args)

    ## fit the model, report train/test stats, save model if required
    output = sys.stdout if args.output is None else open(args.output, 'w')
    save_prefix = args.save_prefix
    fit_epochs(classifier, criteria, trainer, train_iterator, test_iterator,
               args.num_epochs, save_prefix=save_prefix, use_cuda=use_cuda,
               output=output)

    report('Done!')

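# A standalone sketch of the pi calculation performed above, with made-up
# numbers: a disc of radius 3 covers 29 pixels, so 100 expected particles
# per micrograph over 10 micrographs and 10**7 total regions gives
# pi = 29 * 1000 / 10**7 = 0.0029. The helper name and defaults are
# hypothetical, for illustration only.
def _example_pi(radius=3, particles_per_micrograph=100, num_micrographs=10,
                total_regions=10**7):
    import numpy as np
    grid = np.linspace(-radius, radius, 2 * radius + 1)
    xx = grid[:, np.newaxis]
    yy = grid[np.newaxis]
    # binary mask of the pixels labeled positive around one coordinate
    mask = (xx**2 + yy**2 <= radius**2)
    pixels_per_particle = int(mask.sum())  # 29 for radius=3
    return (pixels_per_particle * particles_per_micrograph * num_micrographs
            / total_regions)
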
def load_data(train_images, train_targets, test_images, test_targets, radius,
              k_fold=0, fold=0, cross_validation_seed=42, format_='auto',
              image_ext=''):
    # if train_images is a directory path, map to all images in the directory
    if os.path.isdir(train_images):
        paths = glob.glob(train_images + os.sep + '*' + image_ext)
        valid_paths = []
        image_names = []
        for path in paths:
            name = os.path.basename(path)
            name, ext = os.path.splitext(name)
            if ext in ['.mrc', '.tiff', '.png']:
                image_names.append(name)
                valid_paths.append(path)
        train_images = pd.DataFrame({'image_name': image_names,
                                     'path': valid_paths})
    else:
        train_images = pd.read_csv(train_images, sep='\t')  # training image file list
    #train_targets = pd.read_csv(train_targets, sep='\t') # training particle coordinates file
    train_targets = file_utils.read_coordinates(train_targets, format=format_)

    # check for source columns
    if 'source' not in train_images and 'source' not in train_targets:
        train_images['source'] = 0
        train_targets['source'] = 0
    # load the images and create target masks from the particle coordinates
    train_images = load_images_from_list(train_images.image_name,
                                         train_images.path,
                                         sources=train_images.source)

    # discard coordinates for micrographs not in the set of images
    # and warn the user if any are discarded
    names = set()
    for k, d in train_images.items():
        for name in d.keys():
            names.add(name)
    check = train_targets.image_name.apply(lambda x: x in names)
    missing = train_targets.image_name.loc[~check].unique().tolist()
    if len(missing) > 0:
        print('WARNING: {} micrographs listed in the coordinates file are missing from the training images. Image names are listed below.'.format(len(missing)), file=sys.stderr)
        print('WARNING: missing micrographs are: {}'.format(missing), file=sys.stderr)
    train_targets = train_targets.loc[check]

    # check that the particles roughly fit within the images;
    # if they don't, the user may not have scaled the particles/images correctly
    width = 0
    height = 0
    for k, d in train_images.items():
        for image in d.values():
            w, h = image.size
            if w > width:
                width = w
            if h > height:
                height = h
    out_of_bounds = (train_targets.x_coord > width) | (train_targets.y_coord > height)
    count = out_of_bounds.sum()
    # arbitrary cutoff of more than 10% of particles being out of bounds...
    if count > int(0.1 * len(train_targets)):
        print('WARNING: {} particle coordinates are out of the micrograph dimensions. Did you scale the micrographs and particle coordinates correctly?'.format(count), file=sys.stderr)
    # also check that the coordinates fill most of the micrograph
    x_max = train_targets.x_coord.max()
    y_max = train_targets.y_coord.max()
    if x_max < 0.7 * width and y_max < 0.7 * height:  # more arbitrary cutoffs
        print('WARNING: no coordinates are observed with x_coord > {} or y_coord > {}. Did you scale the micrographs and particle coordinates correctly?'.format(x_max, y_max), file=sys.stderr)

    num_micrographs = sum(len(train_images[k]) for k in train_images.keys())
    num_particles = len(train_targets)
    report('Loaded {} training micrographs with {} labeled particles'.format(
        num_micrographs, num_particles))

    train_images, train_targets = match_images_targets(train_images,
                                                       train_targets, radius)

    if test_images is not None:
        if os.path.isdir(test_images):
            paths = glob.glob(test_images + os.sep + '*' + image_ext)
            valid_paths = []
            image_names = []
            for path in paths:
                name = os.path.basename(path)
                name, ext = os.path.splitext(name)
                if ext in ['.mrc', '.tiff', '.png']:
                    image_names.append(name)
                    valid_paths.append(path)
            test_images = pd.DataFrame({'image_name': image_names,
                                        'path': valid_paths})
        else:
            test_images = pd.read_csv(test_images, sep='\t')
        #test_targets = pd.read_csv(test_targets, sep='\t')
        test_targets = file_utils.read_coordinates(test_targets, format=format_)

        # check for source columns
        if 'source' not in test_images and 'source' not in test_targets:
            test_images['source'] = 0
            test_targets['source'] = 0
        test_images = load_images_from_list(test_images.image_name,
                                            test_images.path,
                                            sources=test_images.source)

        # discard coordinates for micrographs not in the set of images
        # and warn the user if any are discarded
        names = set()
        for k, d in test_images.items():
            for name in d.keys():
                names.add(name)
        check = test_targets.image_name.apply(lambda x: x in names)
        missing = test_targets.image_name.loc[~check].unique().tolist()
        if len(missing) > 0:
            print('WARNING: {} micrographs listed in the coordinates file are missing from the test images. Image names are listed below.'.format(len(missing)), file=sys.stderr)
            print('WARNING: missing micrographs are: {}'.format(missing), file=sys.stderr)
        test_targets = test_targets.loc[check]

        num_micrographs = sum(len(test_images[k]) for k in test_images.keys())
        num_particles = len(test_targets)
        report('Loaded {} test micrographs with {} labeled particles'.format(
            num_micrographs, num_particles))

        test_images, test_targets = match_images_targets(test_images,
                                                         test_targets, radius)
    elif k_fold > 1:
        ## seed for partitioning the data
        random = np.random.RandomState(cross_validation_seed)
        ## make the split
        train_images, train_targets, test_images, test_targets = \
            cross_validation_split(k_fold, fold, train_images, train_targets,
                                   random=random)

        n_train = sum(len(images) for images in train_images)
        n_test = sum(len(images) for images in test_images)
        report('Split into {} train and {} test micrographs'.format(
            n_train, n_test))

    return train_images, train_targets, test_images, test_targets

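# Illustrative sketch (hypothetical file contents) of the two tab-separated
# inputs load_data expects when file paths rather than directories are given:
# an image list with image_name/path columns, and a coordinates table with
# image_name/x_coord/y_coord columns (an optional 'source' column may appear
# in either).
#
#   images.txt:                      particles.txt:
#   image_name<TAB>path              image_name<TAB>x_coord<TAB>y_coord
#   mic_0001<TAB>data/mic_0001.mrc   mic_0001<TAB>1024<TAB>512
#
# A minimal in-memory equivalent, using pandas as the module already does:
def _example_inputs():
    images = pd.DataFrame({'image_name': ['mic_0001'],
                           'path': ['data/mic_0001.mrc']})
    targets = pd.DataFrame({'image_name': ['mic_0001'],
                            'x_coord': [1024], 'y_coord': [512]})
    return images, targets
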