def test_NewFromFileOfFiles( self ): """Pulls in the lymphoma eosin histology 5x6 tiled featureset via .sig files.""" # Types of files containing features: # FIT: contains an entire FeatureSpace definition including features. # FOF: "File Of Files" containing a FeatureSpace structure definition only, # listing paths to files of pre-calculated features (.sig) or the # tiff images themselves so features can be calculated # SIG: A text file containing pre-calculated features for a single sample. # Test dataset: subset of the IICBU2008 lymphoma dataset. 2 channels (H+E), # 3 classes ('CLL', 'FL', 'MCL'), 10 images per class per channel, # 5x6 tiling grid = 30 samples per image resulting in # 2 x 3 x 10 X 30 = 1800 total samples available # Files containing features included in this test suite: # 1. lymphoma_iicbu2008_subset_EOSIN_ONLY_t5x6_v3.2features.fit.zip: # A zip archive containing a single FIT file with features pre-calculated. # 2. lymphoma_iicbu2008_subset_HE_t5x6_v3.2features_SIGFILES.zip: # Contains 1800 SIG files, plus 4 FOF files (items 2-5 below): # "lymphoma_iicbu2008_subset_EOSIN_ONLY_images.fof.tsv" # "lymphoma_iicbu2008_subset_EOSIN_ONLY_sigfiles_t5x6-l.fof.tsv" # "lymphoma_iicbu2008_subset_2CHAN_HE_images.fof.tsv" # "lymphoma_iicbu2008_subset_2CHAN_HE_sigfiles_t5x6-l.fof.tsv" # List of possible feature sources: # 1. Single channel FIT (Eosin only) # 2. Single channel FOF (Eosin only) referencing to 30 tiffs (requires global sampling options -t5x6 -l to grab sigs) # 3. Single channel FOF (Eosin only) referencing 900 sig files # 4. Double channel FOF (Eosin+Haemotoxylin) referencing 60 tiffs (requires global sampling options -t5x6 -l to grab sigs) # 5. Double channel FOF (Eosin+Haemotoxylin) referencing 1800 sig files. 
#============================================= # BEGIN CODE TO CREATE TESTDATA ZIP PACKAGE #import zipfile #import zlib #path = '/Users/chris/src/wnd-charm/tests/pywndcharm_tests/TESTDATA_lymphoma_iicbu2008_subset_HE_t5x6_v3.2features_SIGFILES.zip' #zf = zipfile.ZipFile( path, mode='w' ) #import os #classes = 'CLL', 'FL', 'MCL', #channels = 'haemotoxylin', 'eosin' #from collections import defaultdict #sig_tracker = defaultdict(int) #samplegroupid_tracker = {} #samplegroup_counter = 0 # #eosin_tif_fof = [] # 30 lines #eosin_sig_fof = [] # 900 lines #double_tif_fof = [] # 30 lines, 2 feature set columns #double_sig_fof = [] # 900 lines, 2 feature set columns # #for _channel in channels: # zf.write( './' + _channel, compress_type=zipfile.ZIP_DEFLATED ) # for _class in classes: # zf.write( './' + _channel + '/' + _class, compress_type=zipfile.ZIP_DEFLATED ) # for root, dirs, files in os.walk( _channel + '/' + _class ): # for _file in files: # if _file.endswith( '.tif' ): # # Strip off the _H.tif or _E.tif # samplename = _file[:-6] # eosinpath = './eosin/' + _class + '/' + samplename + '_E.tif' # haemopath = './haemotoxylin/' + _class + '/' + samplename + '_H.tif' # if _channel == 'eosin': # eosin_tif_fof.append( eosinpath + '\t' + _class ) # double_tif_fof.append( samplename + '\t' + _class + '\t' + eosinpath + '\t{\tchannel\t=\teosin\t}\t' + haemopath + '\t{\tchannel\t=\thaemotoxylin\t}') # elif _file.endswith( '.sig' ): # zf.write( './' + _channel + '/' + _class + '/' + _file, compress_type=zipfile.ZIP_DEFLATED ) # if _channel == 'eosin': # # Strip off the _H-t5x6_0_0-l.sig # samplename = _file[:-17] + '.tif' # eosinpath = './eosin/' + _class + '/' + _file # haemopath = './haemotoxylin/' + _class + '/' + _file.replace( '_E-t5x6_', '_H-t5x6_' ) # # count samples from 0: # samplesequenceid = str( sig_tracker[ samplename ] ) # sig_tracker[ samplename ] += 1 # if samplename not in samplegroupid_tracker: # samplegroupid_tracker[ samplename ] = samplegroup_counter # 
samplegroup_counter += 1 # samplegroupid = str( samplegroupid_tracker[ samplename ] ) # eosin_sig_fof.append( eosinpath + '\t' + _class ) # double_sig_fof.append( samplename + '\t' + _class + '\t' + eosinpath + '\t{\tchannel\t=\teosin\t;\tsamplegroupid\t=\t' + samplegroupid + '\t;\tsamplesequenceid\t=\t' + samplesequenceid + '\t}\t' + haemopath + '\t{\tchannel\t=\thaemotoxylin\t;\tsamplegroupid\t=\t' + samplegroupid + '\t;\tsamplesequenceid\t=\t' + samplesequenceid + '\t}\t') # #fof_dir = '/Users/chris/src/wnd-charm/tests/pywndcharm_tests/' #with open( 'lymphoma_iicbu2008_subset_EOSIN_ONLY_images.fof.tsv', 'w') as out: # for _ in eosin_tif_fof: # out.write( _ + '\n') #with open( 'lymphoma_iicbu2008_subset_EOSIN_ONLY_sigfiles_t5x6-l.fof.tsv', 'w') as out: # for _ in eosin_sig_fof: # out.write( _ + '\n') #with open( 'lymphoma_iicbu2008_subset_2CHAN_HE_images.fof.tsv', 'w') as out: # for _ in double_tif_fof: # out.write( _ + '\n') #with open( 'lymphoma_iicbu2008_subset_2CHAN_HE_sigfiles_t5x6-l.fof.tsv', 'w') as out: # for _ in double_sig_fof: # out.write( _ + '\n') #zf.write( './' + 'lymphoma_iicbu2008_subset_EOSIN_ONLY_images.fof.tsv', compress_type=zipfile.ZIP_DEFLATED ) #zf.write( './' + 'lymphoma_iicbu2008_subset_EOSIN_ONLY_sigfiles_t5x6-l.fof.tsv', compress_type=zipfile.ZIP_DEFLATED ) #zf.write( './' + 'lymphoma_iicbu2008_subset_2CHAN_HE_images.fof.tsv', compress_type=zipfile.ZIP_DEFLATED ) #zf.write( './' + 'lymphoma_iicbu2008_subset_2CHAN_HE_sigfiles_t5x6-l.fof.tsv', compress_type=zipfile.ZIP_DEFLATED ) #zf.printdir() #zf.close() # END CODE TO CREATE TESTDATA ZIP PACKAGE #============================================= # Inflate the zipped test fit into a temp file import zipfile zipped_file_path = pychrm_test_dir + sep + 'lymphoma_iicbu2008_subset_HE_t5x6_v3.2features_SIGFILES.zip' zf1 = zipfile.ZipFile( zipped_file_path, mode='r' ) tempdir = mkdtemp() zf1.extractall( tempdir ) # for comparison: zf2 = zipfile.ZipFile( pychrm_test_dir + sep + 
'lymphoma_iicbu2008_subset_EOSIN_ONLY_t5x6_v3.2features.fit.zip', mode='r') zf2.extractall( tempdir ) try: kwargs = {} kwargs['pathname'] = tempdir + sep + 'lymphoma_iicbu2008_subset_EOSIN_ONLY_sigfiles_t5x6-l.fof.tsv' kwargs['quiet'] = True # sampling opts: -l -t5x6 implies 5 columns and 6 rows ... I know it's weird. kwargs['long'] = True kwargs['tile_num_rows'] = 6 kwargs['tile_num_cols'] = 5 fs_fof = FeatureSpace.NewFromFileOfFiles( **kwargs ) kwargs['pathname'] = tempdir + sep + 'lymphoma_iicbu2008_subset_eosin_t5x6_v3.2features.fit' fs_fit = FeatureSpace.NewFromFitFile( **kwargs ) # Fit file has less significant figures than Signature files, and it's not # consistent how many there are. Seems like fit file just lops off numbers # at the end. Example: (signatures on top, fit on bottom) # # Example: # - 17.232246, # sig # ? -- # # + 17.2322, # fit # - -63.549056, # sig # ? ^^^ # # + -63.5491, # fit # ? ^ # # - 223.786977, # sig # ? --- # # + 223.787, # fit # More of the same: #(Pdb) fs_fof.data_matrix[0,-5:] #array([ 0.935442, 14.005003, -43.562076, 127.394914, 0.628772]) #(Pdb) fs_fit.data_matrix[0,-5:] #array([ 0.935442, 14.005 , -43.5621 , 127.395 , 0.628772]) # default is rtol=1e-07, atol=0 #np.testing.assert_allclose( actual=fs_fit.data_matrix, desired=fs_fof.data_matrix, # rtol=1e-03, atol=0 ) #np.testing.assert_array_almost_equal_nulp( fs_fit.data_matrix, fs_fof.data_matrix ) for row_num, (fit_row, fof_row) in enumerate( zip( fs_fit.data_matrix, fs_fof.data_matrix )): retval = compare( fit_row, fof_row ) if retval == False: print "error in sample row", row_num print "FIT: ", fs_fit._contiguous_sample_names[row_num], "FOF", fs_fof._contiguous_sample_names[row_num] self.assertTrue( retval ) # Test sorting; scramble the FOF then load and check: sorted_fof = tempdir + sep + \ 'lymphoma_iicbu2008_subset_EOSIN_ONLY_sigfiles_t5x6-l.fof.tsv' with open( sorted_fof) as fof: lines = fof.readlines() from random import shuffle shuffle(lines) unsorted_fof = tempdir + 
sep + \ 'lymphoma_iicbu2008_subset_EOSIN_ONLY_sigfiles_t5x6-l_UNSORTED.fof.tsv' with open( unsorted_fof, 'w' ) as fof: for line in lines: fof.write( line ) kwargs = {} kwargs['pathname'] = unsorted_fof kwargs['quiet'] = True # sampling opts: -l -t5x6 implies 5 columns and 6 rows ... I know it's weird. kwargs['long'] = True kwargs['tile_num_rows'] = 6 kwargs['tile_num_cols'] = 5 fs_fof = FeatureSpace.NewFromFileOfFiles( **kwargs ) # Check again for row_num, (fit_row, fof_row) in enumerate( zip( fs_fit.data_matrix, fs_fof.data_matrix )): retval = compare( fit_row, fof_row ) if retval == False: print "error in sample row", row_num print "FIT: ", fs_fit._contiguous_sample_names[row_num], "FOF", fs_fof._contiguous_sample_names[row_num] self.assertTrue( retval ) # TESTING TAKE TILES: self.assertRaises( ValueError, fs_fof.TakeTiles, tuple() ) self.assertRaises( ValueError, fs_fof.TakeTiles, (45, 46, 47,) ) self.assertRaises( TypeError, fs_fof.TakeTiles, 'crap' ) # take middle 4 wanted_tiles = ( 14, 15, 20, 21 ) took = fs_fof.TakeTiles( wanted_tiles, inplace=False ) num_sample_groups = len( set( fs_fof._contiguous_sample_group_ids ) ) self.assertEqual( took.num_samples_per_group, len( wanted_tiles ) ) self.assertEqual( took.num_samples, len( wanted_tiles ) * num_sample_groups ) # mid4 = 'lymphoma_iicbu2008_subset_EOSIN_ONLY_sigfiles_MIDDLE_4_TILES_t5x6-l.fof.tsv' # # fake out wndcharm by putting empty tiffs in the temp dir # # we don't need them, the sigs are in there already. 
# with open( mid4) as fof: # lines = fof.readlines() # names, classes, paths, opts = zip( *[ _.split('\t') for _ in lines ] ) # for _path in paths: # with open( tempdir + sep + _path, 'w' ): # pass # took_via_fof = FeatureSpace.NewFromFileOfFiles( mid4, num_samples_per_group=4 ) # # for row_num, (fit_row, fof_row) in enumerate( zip( took.data_matrix, took_via_fof.data_matrix )): # retval = compare( fit_row, fof_row ) # if retval == False: # print "error in sample row", row_num # print "FIT: ", took._contiguous_sample_names[row_num], "FOF", took_via_fof._contiguous_sample_names[row_num] # self.assertTrue( retval ) finally: rmtree( tempdir )
# Read command-line options and load the training set from whichever
# serialization format the user supplied (.fit / .fit.pickled / .fof).
args = parser.parse_args()

num_splits = args.n
num_bins = args.b
input_filename = args.classifier_file_path[0]
outpath = args.output_filepath
dump_pickle = args.D

# Dispatch on file extension to the matching FeatureSpace constructor.
if input_filename.endswith( ".fit" ):
    full_set = FeatureSpace.NewFromFitFile( input_filename )
elif input_filename.endswith( ".fit.pickled" ):
    full_set = FeatureSpace.NewFromPickleFile( input_filename )
elif input_filename.endswith( ".fof" ):
    full_set = FeatureSpace.NewFromFileOfFiles( input_filename )
else:
    raise Exception( 'The classifier must either end in .fit, .fit.pickled, or .fof' )

# -D semantics: 'unset' means the flag wasn't given at all; a truthy value is
# a user-chosen pickle path; an empty/falsy value means "use the default name".
if dump_pickle != 'unset':
    if dump_pickle:
        # user used -D to specify a name for their training set pickle
        full_set.PickleMe( dump_pickle )
    else:
        # user used -D as a flag, use default pickle name pattern
        full_set.PickleMe()

# Integer number of features per bin (truncating division of the feature count).
num_features_per_bin = int( float( len( full_set.feature_names ) ) / float( num_bins ) )
bin_offset = 0
def test_ParallelTiling(self): """Specify bounding box to FeatureVector, calc features, then compare with C++ implementation-calculated feats.""" import zipfile from shutil import copy from tempfile import NamedTemporaryFile refdir = mkdtemp(prefix='ref') targetdir = mkdtemp(prefix='target') try: reference_feats = pychrm_test_dir + sep + 'lymphoma_eosin_channel_MCL_test_img_sj-05-3362-R2_001_E_t6x5_REFERENCE_SIGFILES.zip' zf = zipfile.ZipFile(reference_feats, mode='r') zf.extractall(refdir) img_filename = "lymphoma_eosin_channel_MCL_test_img_sj-05-3362-R2_001_E.tif" orig_img_filepath = pychrm_test_dir + sep + img_filename # copy the tiff to the tempdir so the .sig files end up there too copy(orig_img_filepath, targetdir) copy(orig_img_filepath, refdir) input_image_path = targetdir + sep + img_filename with NamedTemporaryFile(mode='w', dir=refdir, prefix='ref', delete=False) as temp: ref_fof = temp.name temp.write('reference_samp\ttest_class\t{}\t{{}}\n'.format( refdir + sep + img_filename)) with NamedTemporaryFile(mode='w', dir=targetdir, prefix='target', delete=False) as temp: target_fof = temp.name temp.write( 'test_samp\ttest_class\t{}\t{{}}\n'.format(targetdir + sep + img_filename)) global_sampling_options = \ FeatureVector( long=True, tile_num_cols=6, tile_num_rows=5 ) # Should just load reference sigs ref_fs = FeatureSpace.NewFromFileOfFiles( ref_fof, quiet=False, global_sampling_options=global_sampling_options) target_fs = FeatureSpace.NewFromFileOfFiles( target_fof, n_jobs=True, quiet=False, global_sampling_options=global_sampling_options) #from numpy.testing import assert_allclose #self.assertTrue( assert_allclose( ref_fs.data_matrix, target_fs.data_matrix ) ) from wndcharm.utils import compare for row_num, (ref_row, test_row) in enumerate( zip(ref_fs.data_matrix, target_fs.data_matrix)): retval = compare(ref_row, test_row) if retval == False: print "error in sample row", row_num print "FIT: ", ref_fs._contiguous_sample_names[ row_num], "FOF", 
target_fs._contiguous_sample_names[ row_num] self.assertTrue(retval) finally: rmtree(refdir) rmtree(targetdir)