def test_save_read_csv(tmpdir):
    rows = [
        ["col1", "col2"],
        ["foo", "bar"],
        ["foo", "bar"],
        ["foo", "bar"],
        ["foo", "bar"],
        ["foo", "bar"],
    ]
    filepath = tmpdir.join("file.csv")
    save_csv(rows=rows, filepath=str(filepath))
    rows_without_header = read_csv(filepath=str(filepath), header=True)
    rows_with_header = read_csv(filepath=str(filepath), header=False)
    assert rows == rows_with_header
    assert rows[1:] == rows_without_header

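# The tests below rely on a `csv_of_volumes` pytest fixture whose definition is
# not shown in this section. The following is a minimal conftest sketch inferred
# from the assertions below (9 shards of 12 examples with one partial final
# shard is consistent with 100 rows; volumes have shape (8, 8, 8) and an
# identity affine, so affine.sum() == 4). The actual fixture may differ.
import csv

import nibabel as nib
import numpy as np
import pytest


@pytest.fixture(scope="session")
def csv_of_volumes(tmp_path_factory):
    """Write 100 pairs of random (8, 8, 8) volumes and a CSV listing them."""
    tmp = tmp_path_factory.mktemp("volumes")
    rows = []
    for i in range(100):
        features = np.random.rand(8, 8, 8).astype(np.float32)
        labels = np.random.randint(0, 2, size=(8, 8, 8), dtype=np.int32)
        features_path = str(tmp / "features-{:03d}.nii.gz".format(i))
        labels_path = str(tmp / "labels-{:03d}.nii.gz".format(i))
        nib.save(nib.Nifti1Image(features, affine=np.eye(4)), features_path)
        nib.save(nib.Nifti1Image(labels, affine=np.eye(4)), labels_path)
        rows.append((features_path, labels_path))
    filepath = tmp / "files.csv"
    with open(filepath, "w", newline="") as fp:
        csv.writer(fp).writerows(rows)
    return str(filepath)
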
def test_convert(csv_of_volumes, tmp_path):
    files = io.read_csv(csv_of_volumes, skip_header=False)
    tfrecords_template = str(tmp_path / 'data-{shard:03d}.tfrecords')
    volumes_per_shard = 12
    io.convert(
        files,
        tfrecords_template=tfrecords_template,
        volumes_per_shard=volumes_per_shard,
        num_parallel_calls=1)

    paths = sorted(tmp_path.glob('data-*.tfrecords'))
    assert len(paths) == 9
    assert (tmp_path / 'data-008.tfrecords').is_file()

    dset = tf.data.TFRecordDataset(list(map(str, paths)), compression_type='GZIP')
    dset = dset.map(
        io.get_parse_fn(volume_shape=(8, 8, 8), include_affines=True))
    for ref, test in zip(files, dset):
        x, y = ref
        x, x_aff = io.read_volume(x, return_affine=True)
        y, y_aff = io.read_volume(y, return_affine=True)
        assert np.array_equal(x, test[0])
        assert np.array_equal(y, test[1])
        assert np.array_equal(x_aff, test[2])
        assert np.array_equal(y_aff, test[3])

    with pytest.raises(ValueError):
        io.convert(files, tfrecords_template="data/foobar-{}.tfrecords")

def test_write_read_float_labels(csv_of_volumes, tmp_path):  # noqa: F811
    files = io.read_csv(csv_of_volumes, skip_header=False)
    files = [(x, random.random()) for x, _ in files]
    filename_template = str(tmp_path / "data-{shard:03d}.tfrecords")
    examples_per_shard = 12
    tfrecord.write(
        files,
        filename_template=filename_template,
        examples_per_shard=examples_per_shard,
        processes=1,
    )

    paths = sorted(tmp_path.glob("data-*.tfrecords"))
    assert len(paths) == 9
    assert (tmp_path / "data-008.tfrecords").is_file()

    dset = tf.data.TFRecordDataset(list(map(str, paths)), compression_type="GZIP")
    dset = dset.map(
        tfrecord.parse_example_fn(volume_shape=(8, 8, 8), scalar_label=True))
    for ref, test in zip(files, dset):
        x, y = ref
        x = io.read_volume(x)
        assert_array_equal(x, test[0])
        assert_array_equal(y, test[1])

def test_write_read_volume_labels_all_processes(csv_of_volumes, tmp_path):  # noqa: F811
    files = io.read_csv(csv_of_volumes, skip_header=False)
    filename_template = str(tmp_path / "data-{shard:03d}.tfrecords")
    examples_per_shard = 12
    tfrecord.write(
        files,
        filename_template=filename_template,
        examples_per_shard=examples_per_shard,
        processes=None,
    )

    paths = sorted(tmp_path.glob("data-*.tfrecords"))
    assert len(paths) == 9
    assert (tmp_path / "data-008.tfrecords").is_file()

    dset = tf.data.TFRecordDataset(list(map(str, paths)), compression_type="GZIP")
    dset = dset.map(
        tfrecord.parse_example_fn(volume_shape=(8, 8, 8), scalar_label=False))
    for ref, test in zip(files, dset):
        x, y = ref
        x, y = io.read_volume(x), io.read_volume(y)
        assert_array_equal(x, test[0])
        assert_array_equal(y, test[1])

    with pytest.raises(ValueError):
        tfrecord.write(
            files,
            filename_template="data/foobar-{}.tfrecords",
            examples_per_shard=4)

def test_get_data():
    csv_path = nbutils.get_data()
    assert Path(csv_path).is_file()

    files = read_csv(csv_path)
    assert len(files) == 10
    assert all(len(r) == 2 for r in files)
    for x, y in files:
        assert Path(x).is_file()
        assert Path(y).is_file()

def test_read_csv():
    with tempfile.NamedTemporaryFile() as f:
        f.write("foo,bar\nbaz,boo".encode())
        f.seek(0)
        assert [("foo", "bar"), ("baz", "boo")] == io.read_csv(f.name, skip_header=False)

    with tempfile.NamedTemporaryFile() as f:
        f.write("foo,bar\nbaz,boo".encode())
        f.seek(0)
        assert [("baz", "boo")] == io.read_csv(f.name, skip_header=True)

    with tempfile.NamedTemporaryFile() as f:
        f.write("foo,bar\nbaz,boo".encode())
        f.seek(0)
        assert [("baz", "boo")] == io.read_csv(f.name)

    with tempfile.NamedTemporaryFile() as f:
        f.write("foo|bar\nbaz|boo".encode())
        f.seek(0)
        assert [("baz", "boo")] == io.read_csv(f.name, delimiter="|")

def test_cli(csv_of_volumes):
    model_dir = "/tmp/tmpmodeldir"
    cmd = """train
        --n-classes=2
        --model=highres3dnet
        --model-dir={model_dir}
        --optimizer=Adam
        --learning-rate=0.001
        --batch-size=2
        --prefetch=1
        --volume-shape 8 8 8
        --block-shape 8 8 8
        --strides 8 8 8
        --csv={filepath}
        --binarize
        --flip
        --rotate
        --gaussian
        --reduce-contrast
        --salt-and-pepper"""
    cmd = cmd.replace('\n', ' ').format(
        model_dir=model_dir, filepath=csv_of_volumes).split()
    main(args=cmd)
    assert Path(model_dir).is_dir()

    save_dir = "/tmp/tmpmodeldir/savedmodel"
    cmd = """save
        --model=highres3dnet
        --model-dir={model_dir}
        --n-classes=2
        --block-shape 8 8 8
        {save_dir}"""
    cmd = cmd.replace('\n', ' ').format(
        model_dir=model_dir, save_dir=save_dir).split()
    main(args=cmd)
    assert Path(save_dir).is_dir()

    save_dir = next(Path(save_dir).glob('**/saved_model.pb'))
    input_ = read_csv(csv_of_volumes)[0][0]
    output = "/tmp/output.nii.gz"
    cmd = """predict
        --block-shape 8 8 8
        --model={save_dir}
        {input} {output}"""
    cmd = cmd.replace('\n', ' ').format(
        save_dir=save_dir, input=input_, output=output).split()
    main(args=cmd)
    read_volume(output)

def test_read_volume(csv_of_volumes):
    filepath = read_csv(csv_of_volumes)[0][0]

    volume = read_volume(filepath, dtype='float32', return_affine=False)
    assert volume.sum()
    assert volume.shape == (8, 8, 8)
    assert volume.dtype == np.float32

    volume, affine = read_volume(filepath, dtype='int32', return_affine=True)
    assert volume.sum()
    assert volume.shape == (8, 8, 8)
    assert volume.dtype == np.int32
    assert affine.shape == (4, 4)
    assert affine.sum() == 4

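# `read_volume` is part of the package under test. For reference, here is a
# plausible nibabel-based equivalent consistent with the assertions above; a
# hypothetical sketch, not the package's actual implementation.
import nibabel as nib
import numpy as np


def _read_volume_sketch(filepath, dtype=None, return_affine=False):
    """Load a volume from disk, optionally casting it and returning its affine."""
    img = nib.load(filepath)
    data = np.asarray(img.dataobj)
    if dtype is not None:
        data = data.astype(dtype)
    return (data, img.affine) if return_affine else data
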
def test_verify_features_scalar_labels(csv_of_volumes):  # noqa: F811
    files = io.read_csv(csv_of_volumes, skip_header=False)

    # Int labels.
    files = [(x, 0) for (x, _) in files]
    invalid = io.verify_features_labels(
        files, volume_shape=(8, 8, 8), num_parallel_calls=1)
    assert not invalid
    invalid = io.verify_features_labels(
        files, volume_shape=(12, 12, 8), num_parallel_calls=1)
    assert all(invalid)

    # Float labels.
    files = [(x, 1.0) for (x, _) in files]
    invalid = io.verify_features_labels(
        files, volume_shape=(8, 8, 8), num_parallel_calls=1)
    assert not invalid
    invalid = io.verify_features_labels(
        files, volume_shape=(12, 12, 8), num_parallel_calls=1)
    assert all(invalid)

def validate(params):
    normalizer = None
    sm = params["samplewise_minmax"]
    sz = params["samplewise_zscore"]
    if sm and sz:
        raise ValueError("Normalizer cannot be both minmax and zscore")
    if sm:
        normalizer = normalize_zero_one
    if sz:
        normalizer = zscore

    print(params["model"])
    validate_from_filepaths(
        filepaths=read_csv(params["csv"]),
        predictor=params["model"],
        block_shape=params["block_shape"],
        n_classes=params["n_classes"],
        mapping_y=params["label_mapping"],
        output_path=params["output_path"],
        return_variance=params["return_variance"],
        return_entropy=params["return_entropy"],
        return_array_from_images=params["return_array_from_images"],
        n_samples=params["n_samples"],
        normalizer=normalizer,
        batch_size=params["batch_size"])

def test_convert_scalar_float_labels(tmp_path):
    runner = CliRunner()
    with runner.isolated_filesystem():
        csvpath = get_data(str(tmp_path))
        # Make labels scalars.
        data = [(x, 1.0) for (x, _) in read_csv(csvpath)]
        csvpath = tmp_path.with_suffix(".new.csv")
        with open(csvpath, "w", newline="") as myfile:
            wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
            wr.writerows(data)

        tfrecords_template = Path("data/shard-{shard:03d}.tfrecords")
        tfrecords_template.parent.mkdir(exist_ok=True)
        args = """\
        convert
            --csv={}
            --tfrecords-template={}
            --volume-shape 256 256 256
            --examples-per-shard=2
            --to-ras
            --no-verify-volumes
        """.format(csvpath, tfrecords_template)
        result = runner.invoke(climain.cli, args.split())
        assert result.exit_code == 0
        assert Path("data/shard-000.tfrecords").is_file()
        assert Path("data/shard-001.tfrecords").is_file()
        assert Path("data/shard-002.tfrecords").is_file()
        assert Path("data/shard-003.tfrecords").is_file()
        assert Path("data/shard-004.tfrecords").is_file()
        assert not Path("data/shard-005.tfrecords").is_file()

def train(params):
    """Train estimator."""
    if params['aparcaseg_mapping']:
        tf.logging.info(
            "Reading mapping file: {}".format(params['aparcaseg_mapping']))
        mapping = read_mapping(params['aparcaseg_mapping'])
    else:
        mapping = None

    def normalizer_aparcaseg(features, labels):
        return (
            normalize_zero_one(features),
            preprocess_aparcaseg(labels, mapping))

    def normalizer_brainmask(features, labels):
        return (
            normalize_zero_one(features),
            binarize(labels, threshold=0))

    if params['aparcaseg_mapping'] is not None:
        normalizer = normalizer_aparcaseg
    elif params['brainmask']:
        normalizer = normalizer_brainmask
    else:
        normalizer = None

    list_of_filepaths = read_csv(params['csv'])

    def generator_builder():
        """Return a generator that yields blocks of volumes."""
        return iter_volumes(
            list_of_filepaths=list_of_filepaths,
            vol_shape=params['vol_shape'],
            block_shape=params['block_shape'],
            x_dtype=_DT_X_NP,
            y_dtype=_DT_Y_NP,
            strides=params['strides'],
            shuffle=True,
            normalizer=normalizer)

    _output_shapes = (
        (*params['block_shape'], 1),
        params['block_shape'])

    input_fn = input_fn_builder(
        generator=generator_builder,
        output_types=(_DT_X_TF, _DT_Y_TF),
        output_shapes=_output_shapes,
        num_epochs=params['n_epochs'],
        batch_size=params['batch_size'],
        # TODO(kaczmarj): add multi-gpu support for training on volumes.
        # multi_gpu=params['multi_gpu'],
        # examples_per_epoch=examples_per_epoch,
    )

    runconfig = tf.estimator.RunConfig(
        save_summary_steps=25,
        save_checkpoints_steps=500,
        keep_checkpoint_max=100)

    model = nobrainer.models.get_estimator(params['model'])(
        n_classes=params['n_classes'],
        optimizer=params['optimizer'],
        learning_rate=params['learning_rate'],
        model_dir=params['model_dir'],
        config=runconfig,
        multi_gpu=params['multi_gpu'])

    # Set up for training and periodic evaluation.
    if params['eval_csv'] is not None:
        eval_list_of_filepaths = read_csv(params['eval_csv'])
        gen = nobrainer.util.iter_volumes(
            list_of_filepaths=eval_list_of_filepaths,
            x_dtype=_DT_X_NP,
            y_dtype=_DT_Y_NP,
            vol_shape=params['vol_shape'],
            block_shape=params['block_shape'],
            strides=params['strides'],
            shuffle=False,
            normalizer=normalizer)

        def _get_eval_features_labels():
            _features = []
            _labels = []
            for _f, _l in gen:
                _features.append(_f)
                _labels.append(_l)
            return np.stack(_features), np.stack(_labels)

        tf.logging.info("Loading evaluation data")
        _eval_features, _eval_labels = _get_eval_features_labels()

        eval_input_fn = tf.estimator.inputs.numpy_input_fn(
            x=_eval_features,
            y=_eval_labels,
            batch_size=2,
            num_epochs=1,
            shuffle=False)

        _monitors = [
            tf.contrib.learn.monitors.ValidationMonitor(
                input_fn=eval_input_fn,
                every_n_steps=2000,
                early_stopping_metric=None,
                early_stopping_rounds=None)]
        hooks = tf.contrib.learn.monitors.replace_monitors_with_hooks(
            _monitors, model)
    # Training without evaluation.
    else:
        hooks = None

    model.train(input_fn=input_fn, hooks=hooks)

def train(params):
    model_config = tf.estimator.RunConfig(
        save_summary_steps=params['save_summary_steps'],
        save_checkpoints_steps=params['save_checkpoints_steps'],
        keep_checkpoint_max=params['keep_checkpoint_max'])

    model = get_estimator(params['model'])(
        n_classes=params['n_classes'],
        optimizer=params['optimizer'],
        learning_rate=params['learning_rate'],
        model_dir=params['model_dir'],
        config=model_config,
        multi_gpu=params['multi_gpu'],
        **params['model_opts'])

    label_mapping = None
    if params['label_mapping']:
        tf.logging.info(
            "Reading mapping file: {}".format(params['label_mapping']))
        label_mapping = read_mapping(params['label_mapping'])

    filepaths = read_csv(params['csv'])

    volume_data_generator = VolumeDataGenerator(
        samplewise_minmax=params['samplewise_minmax'],
        samplewise_zscore=params['samplewise_zscore'],
        samplewise_center=params['samplewise_center'],
        samplewise_std_normalization=params['samplewise_std_normalization'],
        flip=params['flip'],
        rescale=params['rescale'],
        rotate=params['rotate'],
        gaussian=params['gaussian'],
        reduce_contrast=params['reduce_contrast'],
        salt_and_pepper=params['salt_and_pepper'],
        brightness_range=params['brightness_range'],
        shift_range=params['shift_range'],
        zoom_range=params['zoom_range'],
        binarize_y=params['binarize'],
        mapping_y=label_mapping)

    if params['eval_csv']:
        eval_filepaths = read_csv(params['eval_csv'])
        eval_volume_data_generator = VolumeDataGenerator(
            binarize_y=params['binarize'],
            mapping_y=label_mapping)
    else:
        eval_filepaths = None
        eval_volume_data_generator = None

    _train(
        model=model,
        volume_data_generator=volume_data_generator,
        filepaths=filepaths,
        volume_shape=params['volume_shape'],
        block_shape=params['block_shape'],
        strides=params['strides'],
        x_dtype='float32',
        y_dtype='int32',
        shuffle=True,
        batch_size=params['batch_size'],
        n_epochs=params['n_epochs'],
        prefetch=params['prefetch'],
        multi_gpu=params['multi_gpu'],
        eval_volume_data_generator=eval_volume_data_generator,
        eval_filepaths=eval_filepaths)

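# A minimal `params` mapping accepted by `train` above might look like the
# following. All values are illustrative placeholders; the keys simply mirror
# the lookups in the function body.
_example_params = {
    # Checkpointing and summaries.
    'save_summary_steps': 25,
    'save_checkpoints_steps': 500,
    'keep_checkpoint_max': 5,
    # Model and optimization.
    'model': 'highres3dnet',
    'model_opts': {},
    'model_dir': '/tmp/model',
    'n_classes': 2,
    'optimizer': 'Adam',
    'learning_rate': 0.001,
    'multi_gpu': False,
    # Data.
    'csv': 'training-volumes.csv',
    'eval_csv': None,
    'label_mapping': None,
    'volume_shape': (256, 256, 256),
    'block_shape': (64, 64, 64),
    'strides': (64, 64, 64),
    'batch_size': 2,
    'n_epochs': 1,
    'prefetch': 1,
    # Normalization and augmentation.
    'samplewise_minmax': True,
    'samplewise_zscore': False,
    'samplewise_center': False,
    'samplewise_std_normalization': False,
    'flip': False,
    'rescale': 0,
    'rotate': False,
    'gaussian': False,
    'reduce_contrast': False,
    'salt_and_pepper': False,
    'brightness_range': 0,
    'shift_range': 0,
    'zoom_range': 0,
    'binarize': True,
}
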
time_zero = time.time()

namespace = parse_args(sys.argv[1:])
params = vars(namespace)

if params['verbose'] >= 1:
    logger.setLevel(logging.DEBUG)
elif params['verbose'] == 0:
    logger.setLevel(logging.INFO)

if os.path.isdir(params['input']):
    logger.info("Assuming SUBJECTS_DIR was passed in. Finding file pairs")
    list_of_files = get_list_of_t1_aparcaseg(params['input'])
elif os.path.isfile(params['input']):
    logger.info("Reading CSV")
    list_of_files = read_csv(params['input'])
else:
    raise ValueError(
        "Input must be the path to an existing FreeSurfer SUBJECTS_DIR or"
        " to an existing CSV file.")

logger.info("Found {} pairs of volumes".format(len(list_of_files)))
logger.info("User requested chunk size of {}".format(params['chunksize']))
logger.info("Will iterate over {} set(s) of block shape(s)".format(
    len(params['block_shape'])))

if params['save_filepaths'] is not None:
    _df = pd.DataFrame(list_of_files)
    _df.columns = ["features", "labels"]
    logger.info(
        "Saving CSV of filepaths found by this script to {}".format(
            params['save_filepaths']))

def test_verify_features_nonscalar_labels(csv_of_volumes):  # noqa: F811
    files = io.read_csv(csv_of_volumes, skip_header=False)
    invalid = io.verify_features_labels(
        files, volume_shape=(8, 8, 8), num_parallel_calls=1)
    assert not invalid

def convert(
    csv,
    preprocess_path,
    tfrecords_template,
    volume_shape,
    examples_per_shard,
    num_parallel_calls,
    verbose,
):
    """Preprocess MRI volumes and convert to TFRecords.

    NOTE: Volumes will all be the same shape after preprocessing.
    """
    volume_filepaths = read_csv(csv)
    num_parallel_calls = None if num_parallel_calls == -1 else num_parallel_calls
    if num_parallel_calls is None:
        # Get the number of CPUs allocated to the current process.
        # Note the difference from `os.cpu_count()`.
        num_parallel_calls = len(os.sched_getaffinity(0))

    invalid_pairs = verify_features_labels(
        volume_filepaths,
        check_labels_int=True,
        num_parallel_calls=num_parallel_calls,
        verbose=verbose,
    )

    # UNCOMMENT the following when https://github.com/neuronets/nobrainer/pull/125
    # is merged.
    # if not invalid_pairs:
    #     click.echo(click.style("Passed verification.", fg="green"))
    # else:
    #     click.echo(click.style("Failed verification.", fg="red"))
    #     for pair in invalid_pairs:
    #         click.echo(pair[0])
    #         click.echo(pair[1])
    #     sys.exit(-1)

    ppaths = preprocess_parallel(
        volume_filepaths,
        conform_volume_to=volume_shape,
        num_parallel_calls=num_parallel_calls,
        save_path=preprocess_path,
    )

    invalid_pairs = verify_features_labels(
        ppaths,
        volume_shape=volume_shape,
        check_labels_int=True,
        num_parallel_calls=num_parallel_calls,
        verbose=verbose,
    )

    if not invalid_pairs:
        click.echo()
    else:
        click.echo(click.style("Failed post-preprocessing re-verification.", fg="red"))
        click.echo(
            "Oops! This is embarrassing. It looks like the preprocessing"
            f" script failed. Found {len(invalid_pairs)} invalid pairs of"
            f" volumes. These files might not all have shape {volume_shape},"
            " or the labels might not be scalar values. Please report this"
            " issue on https://github.com/poldracklab/nondefaced-detector"
        )
        for pair in invalid_pairs:
            click.echo(pair[0])
            click.echo(pair[1])
        sys.exit(-1)

    # TODO: Convert to tfrecords
    os.makedirs(os.path.dirname(tfrecords_template), exist_ok=True)
    _write_tfrecord(
        features_labels=ppaths,
        filename_template=tfrecords_template,
        examples_per_shard=examples_per_shard,
        processes=num_parallel_calls,
        verbose=verbose,
    )
    click.echo(click.style("Finished conversion to TFRecords.", fg="green"))

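# Example invocation of the `convert` command above. The entry-point name and
# option names are assumptions: the click decorators are not shown here, so
# the flags below simply mirror the function's parameter names, and all paths
# and shapes are illustrative placeholders.
#
#   nondefaced-detector convert \
#       --csv pairs.csv \
#       --preprocess-path /tmp/preprocessed \
#       --tfrecords-template "tfrecords/data-{shard:03d}.tfrecords" \
#       --volume-shape 64 64 64 \
#       --examples-per-shard 2 \
#       --num-parallel-calls -1
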
def test_verify_features_labels(csv_of_volumes):
    files = io.read_csv(csv_of_volumes, skip_header=False)
    io.verify_features_labels(files, volume_shape=(8, 8, 8), num_parallel_calls=1)