def test_NewFromDirectory( self ):
    """Build a FeatureSpace from directories of images and check the features.

    Two directory layouts are exercised:

    1. A flat directory holding a single image. The first row of the
       resulting data matrix must match the reference signature file.
    2. A top-level directory with one subdirectory per class. The
       subdirectories are created in the order 'C', 'B', 'A'; the class names
       must come back as ['A', 'B', 'C'], and every row must match the
       reference features because the same image is copied into each class.
    """
    from os import mkdir
    from shutil import copy

    ref_sig_path = 'lymphoma_eosin_channel_MCL_test_img_sj-05-3362-R2_001_E-t6x5_5_4-l.sig'
    ref_fv = FeatureVector.NewFromSigFile( pychrm_test_dir + sep + ref_sig_path )

    img_filename = "lymphoma_eosin_channel_MCL_test_img_sj-05-3362-R2_001_E-t6x5_5_4-l.tiff"
    orig_img_filepath = pychrm_test_dir + sep + img_filename

    # Layout 1: a single image in a flat directory.
    tempdir = mkdtemp()
    try:
        # Copy inside the try so the temp dir is removed even if the copy fails.
        copy( orig_img_filepath, tempdir )
        fs = FeatureSpace.NewFromDirectory( tempdir, quiet=False )
        self.assertTrue( compare( fs.data_matrix[0], ref_fv.values ) )
    finally:
        rmtree( tempdir )

    # Layout 2: one subdirectory per class, created out of alphabetical order.
    toptempdir = mkdtemp()
    try:
        for letter in 'CBA':
            dirname = toptempdir + sep + letter
            mkdir( dirname )
            copy( orig_img_filepath, dirname )
        fs = FeatureSpace.NewFromDirectory( toptempdir, quiet=False )
        self.assertEqual( fs.class_names, ['A', 'B', 'C'] )
        for row_of_features in fs.data_matrix:
            self.assertTrue( compare( row_of_features, ref_fv.values ) )
    finally:
        rmtree( toptempdir )
def test_Normalize(self):
    """Normalize a raw feature space and compare it to a stored normalized one.

    Loads the fit file at ``self.test_fit_path``, normalizes it in place, and
    checks the resulting data matrix against the one loaded from
    ``self.test_normalized_fit_path`` with a relative tolerance of 1e-05.
    """
    from numpy.testing import assert_allclose

    result_fs = FeatureSpace.NewFromFitFile(self.test_fit_path).Normalize(inplace=True)
    target_fs = FeatureSpace.NewFromFitFile(self.test_normalized_fit_path)
    assert_allclose(result_fs.data_matrix, target_fs.data_matrix, rtol=1e-05)
def test_LDATransform( self ):
    """Smoke-test LDATransform against itself and against a train/test split.

    The zipped lymphoma fit file is inflated into a temp dir and loaded, then:

    * the whole feature space is LDA-transformed with no reference and
      classified against itself with WND5;
    * the feature space is split, the training half is transformed in place,
      the test half is transformed using the training half as reference, and
      the pair is classified with WND5.

    No values are asserted; the test passes if none of these calls raises.
    """
    import zipfile

    tempdir = mkdtemp()
    try:
        zip_path = pychrm_test_dir + sep + 'lymphoma_iicbu2008_subset_EOSIN_ONLY_t5x6_v3.2features.fit.zip'
        # Extract inside the try, and close the archive when done, so neither
        # the file handle nor the temp dir leaks if extraction fails.
        with zipfile.ZipFile( zip_path, mode='r' ) as zf:
            zf.extractall( tempdir )
        fitfile_path = tempdir + sep + 'lymphoma_iicbu2008_subset_eosin_t5x6_v3.2features.fit'

        kwargs = {}
        kwargs['pathname'] = fitfile_path
        kwargs['quiet'] = True
        # sampling opts: -l -t5x6 implies 5 columns and 6 rows ... I know it's weird.
        kwargs['long'] = True
        kwargs['tile_num_rows'] = 6
        kwargs['tile_num_cols'] = 5

        # against self:
        fs = FeatureSpace.NewFromFitFile( **kwargs )
        self_transformed = fs.LDATransform( reference_features=None, inplace=False )
        FeatureSpaceClassification.NewWND5(
            self_transformed, self_transformed, feature_weights=None )

        # against other:
        train, test = fs.Split()
        train.LDATransform( reference_features=None, inplace=True )
        test.LDATransform( reference_features=train, inplace=True )
        FeatureSpaceClassification.NewWND5( train, test, feature_weights=None )
    finally:
        rmtree( tempdir )
def test_DiscreteTrainTestSplitWithTiling( self ):
    """Uses a curated subset of the IICBU 2008 Lymphoma dataset, preprocessed as follows:
    auto-deconvolved, eosin channel only, tiled 5x6, 3 classes, 10 imgs per class,
    300 samples per class.

    The fit file is split with a fixed random seed, the training half is
    normalized and used to compute thresholded Fisher weights, and both halves
    are feature-reduced; the test half is then normalized against the training
    half. No values are asserted; the test passes if the pipeline runs.
    """
    import zipfile
    from numpy.random import RandomState

    zipped_file_path = pychrm_test_dir + sep + 'lymphoma_iicbu2008_subset_EOSIN_ONLY_t5x6_v3.2features.fit.zip'

    tempdir = mkdtemp()
    try:
        # Inflate the zipped test fit into the temp dir. Doing this inside the
        # try and closing the archive afterwards avoids leaking the handle or
        # the directory when extraction fails.
        with zipfile.ZipFile( zipped_file_path, mode='r' ) as zf:
            zf.extractall( tempdir )
            fitfilepath = tempdir + sep + zf.namelist()[0]

        # NOTE(review): other tests in this file load the same -t5x6 data with
        # tile_num_rows=6, tile_num_cols=5 -- confirm which orientation is intended.
        fs = FeatureSpace.NewFromFitFile( fitfilepath, tile_num_rows=5, tile_num_cols=6 )

        prng = RandomState(42)
        train, test = fs.Split( random_state=prng, quiet=True )
        train.Normalize( inplace=True, quiet=True )
        fw = FisherFeatureWeights.NewFromFeatureSpace( train ).Threshold()

        train.FeatureReduce( fw, inplace=True )
        test.FeatureReduce( fw, inplace=True ).Normalize( train, inplace=True, quiet=True )
    finally:
        rmtree( tempdir )
def test_ContinuousFitOnFit(self):
    """Run multivariate linear regression on an artificial feature space.

    An artificial discrete feature space is written to a fit file in a temp
    dir, read back with ``discrete=False``, normalized, reduced with
    thresholded Pearson weights, and passed to the multivariate linear
    regression. No values are asserted; the test passes if nothing raises.
    """
    from wndcharm.ArtificialFeatureSpace import CreateArtificialFeatureSpace_Discrete

    artificial_opts = {
        'n_samples': 1000,
        'n_classes': 10,
        'num_features_per_signal_type': 30,
        'noise_gradient': 5,
        'initial_noise_sigma': 10,
        'n_samples_per_group': 1,
        'interpolatable': True,
    }
    discrete_space = CreateArtificialFeatureSpace_Discrete(**artificial_opts)

    workdir = mkdtemp()
    fit_path = workdir + sep + 'Artificial.fit'
    try:
        discrete_space.ToFitFile(fit_path)

        continuous_space = FeatureSpace.NewFromFitFile(fit_path, discrete=False)
        continuous_space.Normalize(quiet=True)

        all_weights = PearsonFeatureWeights.NewFromFeatureSpace(continuous_space)
        kept_weights = all_weights.Threshold()
        reduced_space = continuous_space.FeatureReduce(kept_weights)

        batch_result = FeatureSpaceRegression.NewMultivariateLinear(
            reduced_space, kept_weights, quiet=True)
    finally:
        rmtree(workdir)
def test_NewFromFeatureSet(self):
    """Fisher score calculation, plus slicing of the resulting weights.

    Fisher weights computed from the normalized test fit file are compared
    value-by-value (within ``self.epsilon``) to the stored reference weights.
    The slice operator is then checked for a plain prefix slice and for a
    stepped slice.
    """
    fs = FeatureSpace.NewFromFitFile(self.test_fit_path).Normalize(inplace=True)
    fw = FisherFeatureWeights.NewFromFeatureSpace(fs)

    # test weights generated from test-l.fit:
    # wndchrm classify -l -f1.0 -vtest_fit-l.weights test-l.fit test-l.fit
    target_fw = FisherFeatureWeights.NewFromFile(self.test_feat_weight_path)

    for target_val, res_val in zip(target_fw.values, fw.values):
        self.assertAlmostEqual(target_val, res_val, delta=self.epsilon)

    # test slice operator: prefix slice
    sliced = fw[:10]
    self.assertEqual(len(sliced), 10)
    self.assertEqual(len(sliced.feature_names), 10)
    self.assertEqual(len(sliced.values), 10)
    for i in range(10):
        self.assertEqual(sliced.feature_names[i], fw.feature_names[i])
        self.assertEqual(sliced.values[i], fw.values[i])

    # test slice operator: stepped slice
    source_indices = range(50, 100, 2)
    sliced = fw[50:100:2]
    # Check the length explicitly; otherwise a slice that came back too short
    # would simply shorten the comparison loop below and pass unnoticed.
    self.assertEqual(len(sliced), len(source_indices))
    for i, j in enumerate(source_indices):
        self.assertEqual(sliced.feature_names[i], fw.feature_names[j])
        self.assertEqual(sliced.values[i], fw.values[j])
def test_FitOnFitClassification(self):
    """Compare fit-on-fit WND5 results with those parsed from an HTML report.

    The test fit file is normalized, reduced to the top 438 Fisher-weighted
    features, and classified against itself. Each per-sample result is then
    compared for equality with the corresponding result in the single split
    read from 'test-l_training_error_result.html'.
    """
    from wndcharm.FeatureSpacePredictionExperiment import FeatureSpaceClassificationExperiment

    fitfile_path = wndchrm_test_dir + sep + 'test-l.fit'
    fs = FeatureSpace.NewFromFitFile(fitfile_path)
    fs.Normalize(inplace=True, quiet=True)
    fw = FisherFeatureWeights.NewFromFeatureSpace(fs).Threshold(438)
    fw.Print(50)
    fs.FeatureReduce(fw, inplace=True)
    pychrm_split = FeatureSpaceClassification.NewWND5(fs, fs, fw, quiet=False)

    html_path = pychrm_test_dir + sep + 'test-l_training_error_result.html'
    html_exp = FeatureSpaceClassificationExperiment.NewFromHTMLReport(
        html_path, quiet=False)
    # single split in this html
    html_split = html_exp.individual_results[0]

    result_pairs = zip(html_split.individual_results,
                       pychrm_split.individual_results)
    for i, (html_result, pychrm_result) in enumerate(result_pairs):
        # Same comparison assertEqual performs, but the diagnostic naming the
        # failing index and both results is only built on a mismatch and is
        # actually reported (it used to be assembled and then discarded).
        if not html_result == pychrm_result:
            outstr = "Error in comparison # {0}:\n".format(i)
            outstr += "HTML result:\n{0}\n Python API res:\n{1}".format(
                html_result, pychrm_result)
            self.fail(outstr)
def test_SamplesUnion(self):
    """Check SamplesUnion rejects a mismatched space and `+` joins matching ones.

    Calling SamplesUnion on an artificial feature space with the space loaded
    from test-l.fit must raise ValueError. Adding two artificial feature
    spaces built with identical options must give a space with the same
    number of classes.
    """
    from wndcharm.ArtificialFeatureSpace import CreateArtificialFeatureSpace_Discrete

    n_classes = 2
    artificial_opts = dict(
        n_samples=20,
        n_classes=n_classes,
        num_features_per_signal_type=30,
        noise_gradient=5,
        initial_noise_sigma=10,
        n_samples_per_group=1,
        interpolatable=True)

    first_artificial = CreateArtificialFeatureSpace_Discrete(**artificial_opts)

    from_fit_file = FeatureSpace.NewFromFitFile(wndchrm_test_dir + sep + 'test-l.fit')
    self.assertRaises(ValueError, first_artificial.SamplesUnion, other_fs=from_fit_file)

    second_artificial = CreateArtificialFeatureSpace_Discrete(**artificial_opts)
    combined = first_artificial + second_artificial
    self.assertEqual(n_classes, combined.num_classes)
def test_Normalize( self ):
    """Load unnormalized feature space, normalize, then compare to stored normalized feature space.

    Two normalizations are checked:

    * the default ``Normalize``, against the stored normalized fit file
      (relative tolerance 1e-05);
    * ``Normalize( zscore=True )``, against a reference computed here with
      ``scipy.stats.mstats.zscore``, with NaNs in the reference replaced by 0.
    """
    raw_fs = FeatureSpace.NewFromFitFile( self.test_fit_path )

    result_fs = raw_fs.Normalize( inplace=False )
    target_fs = FeatureSpace.NewFromFitFile( self.test_normalized_fit_path )

    from numpy.testing import assert_allclose
    assert_allclose( result_fs.data_matrix, target_fs.data_matrix, rtol=1e-05 )

    # Create reference Z-score feature space
    from scipy.stats.mstats import zscore
    from wndcharm.utils import ReplaceNonReal
    ReplaceNonReal( raw_fs.data_matrix )

    # errstate restores the previous floating-point error settings even if
    # zscore raises, unlike a manual seterr save/restore pair.
    with np.errstate( all='ignore' ):
        target_matrix = zscore( raw_fs.data_matrix )
        target_matrix[ np.isnan( target_matrix ) ] = 0

    result_fs = raw_fs.Normalize( inplace=False, zscore=True )
    assert_allclose( result_fs.data_matrix, target_matrix )
def test_WND5_all_features(self):
    """Classify one pre-calculated sample with WND5 at several feature counts.

    For each feature count, the training set and the sample are reduced with
    the stored Fisher weights thresholded to that count, the sample is
    classified, and its marginal probabilities (rounded to 3 places) are
    compared to the values listed below.
    """
    epsilon = 0.00001

    # Define paths to original files
    test_sig_path = join(test_dir, 't1_s01_c05_ij-l_precalculated.sig')
    test_fit_path = join(test_dir, 'test-l.fit')
    test_feat_wght_path = join(test_dir, 'test_fit-l.weights')
    test_tif_path = join(test_dir, 't1_s01_c05_ij.tif')

    # Here are the correct values that Python API needs to return:
    # wndchrm classify -l -f0.75 test-l.fit t1_s01_c05_ij.tif
    #   t1_s01_c05_ij.tif 1.6e-27 0.083 0.917 * 4cell 3.835
    # wndchrm classify -l test-l.fit t1_s01_c05_ij.tif
    #   t1_s01_c05_ij.tif 3.19e-27 0.076 0.924 * 4cell 3.848
    # wndchrm classify -l -f0.05 test-l.fit t1_s01_c05_ij.tif
    #   t1_s01_c05_ij.tif 1.06e-26 0.066 0.934 * 4cell 3.869
    correct_marg_probs = {
        2189: [0.083, 0.917],
        438: [0.076, 0.924],
        146: [0.066, 0.934],
    }

    # Load the original files once and only once for all this class's tests
    feature_set = FeatureSpace.NewFromFitFile(test_fit_path)
    names_before = feature_set.feature_names
    feature_set.Normalize()
    names_after = feature_set.feature_names
    self.assertSequenceEqual(names_before, names_after)

    test_sample = FeatureVector(source_filepath=test_tif_path, long=True)
    test_sample.LoadSigFile(test_sig_path)
    self.assertSequenceEqual(feature_set.feature_names,
                             test_sample.feature_names)
    test_sample.Normalize(feature_set)

    all_weights = FisherFeatureWeights.NewFromFile(test_feat_wght_path)

    for num_feats, expected_probs in correct_marg_probs.items():
        weights = all_weights.Threshold(num_feats)
        reduced_set = feature_set.FeatureReduce(weights)
        reduced_sample = test_sample.FeatureReduce(weights)
        result = SingleSampleClassification.NewWND5(
            reduced_set, weights, reduced_sample)

        rounded_probs = []
        for val in result.marginal_probabilities:
            rounded_probs.append(round(val, 3))

        for target_val, res_val in zip(expected_probs, rounded_probs):
            self.assertAlmostEqual(target_val, res_val, delta=epsilon)
def test_NewFromFeatureSet(self):
    """Fisher score calculation

    Fisher weights computed from the normalized test fit file must match the
    stored reference weights value-by-value, within ``self.epsilon``.
    """
    normalized_space = FeatureSpace.NewFromFitFile(self.test_fit_path)
    normalized_space.Normalize(inplace=True)
    computed = FisherFeatureWeights.NewFromFeatureSpace(normalized_space)

    # test weights generated from test-l.fit:
    # wndchrm classify -l -f1.0 -vtest_fit-l.weights test-l.fit test-l.fit
    expected = FisherFeatureWeights.NewFromFile(self.test_feat_weight_path)

    value_pairs = zip(expected.values, computed.values)
    for expected_val, computed_val in value_pairs:
        self.assertAlmostEqual(expected_val, computed_val, delta=self.epsilon)
def test_DiscreteTrainTestSplitNoTiling( self ):
    """Uses binucleate test set

    The test fit file is split with a fixed random seed, thresholded Fisher
    weights are computed from the training half, both halves are reduced to
    those features, and the pair is classified with WND5. No values are
    asserted; the test passes if nothing raises.
    """
    from numpy.random import RandomState

    source_fs = FeatureSpace.NewFromFitFile( sep.join( ( wndchrm_test_dir, 'test-l.fit' ) ) )

    seeded_prng = RandomState(42)
    split_halves = source_fs.Split( random_state=seeded_prng, quiet=True )
    full_train, full_test = split_halves
    full_train.Normalize( quiet=True )

    all_fw = FisherFeatureWeights.NewFromFeatureSpace( full_train )
    reduced_fw = all_fw.Threshold()

    reduced_train = full_train.FeatureReduce( reduced_fw )
    reduced_test = full_test.FeatureReduce( reduced_fw )
    reduced_test.Normalize( reduced_train, quiet=True )

    batch_result = FeatureSpaceClassification.NewWND5(
        reduced_train, reduced_test, reduced_fw, quiet=True )
def test_ClassSortSamplesByGroundTruth( self ):
    """If class names have interpolatable value, sort by that value,
    otherwise by class name alphabetical order"""
    labels_by_value = ['ones_1.0', 'twos_2.0', 'threes_3.0', 'fours_4.0']
    labels_alphabetical = ['fours_4.0', 'ones_1.0', 'threes_3.0', 'twos_2.0']

    # Hand-build a 4-sample, 5-feature space; row k is filled with the value k+1.
    fs = FeatureSpace( name='training set', num_samples=4, num_features=5 )
    for value in (1, 2, 3, 4):
        fs.data_matrix[ value - 1 ].fill( value )

    fs.feature_names = [ "feat" + char for char in "ABCDE" ]
    fs._contiguous_sample_group_ids = range(4)
    fs._contiguous_sample_sequence_ids = [1] * 4
    fs._contiguous_sample_names = ['ones', 'twos', 'threes', 'fours']
    # Each attribute gets its own list object, as in a freshly built space.
    fs._contiguous_ground_truth_labels = list( labels_by_value )
    fs._contiguous_ground_truth_values = [1, 2, 3, 4]
    fs.num_classes = 4
    fs.class_names = list( labels_by_value )
    fs.interpolation_coefficients = [1.0, 2.0, 3.0, 4.0]

    # Default sort: by ground truth value,
    # i.e., not ['fours', 'ones', 'threes', 'twos'] aka class alphabetical order
    fs.SortSamplesByGroundTruth( inplace=True )
    self.assertEqual( fs.class_names, labels_by_value )

    # Forcing labels gives class alphabetical order.
    fs.SortSamplesByGroundTruth( inplace=True, force_use_labels=True )
    self.assertEqual( fs.class_names, labels_alphabetical )
def test_NewFromFileOfFiles( self ):
    """Pulls in the lymphoma eosin histology 5x6 tiled featureset via .sig files.

    The same eosin-only data is loaded twice, once from a FOF listing .sig
    files and once from a FIT file, and the two data matrices are compared row
    by row. The FOF is then scrambled and reloaded to check that loading sorts
    the samples, and finally TakeTiles is exercised on the result.
    """

    # Types of files containing features:
    # FIT: contains an entire FeatureSpace definition including features.
    # FOF: "File Of Files" containing a FeatureSpace structure definition only,
    #      listing paths to files of pre-calculated features (.sig) or the
    #      tiff images themselves so features can be calculated
    # SIG: A text file containing pre-calculated features for a single sample.

    # Test dataset: subset of the IICBU2008 lymphoma dataset. 2 channels (H+E),
    #      3 classes ('CLL', 'FL', 'MCL'), 10 images per class per channel,
    #      5x6 tiling grid = 30 samples per image resulting in
    #      2 x 3 x 10 X 30 = 1800 total samples available

    # Files containing features included in this test suite:
    # 1. lymphoma_iicbu2008_subset_EOSIN_ONLY_t5x6_v3.2features.fit.zip:
    #    A zip archive containing a single FIT file with features pre-calculated.
    # 2. lymphoma_iicbu2008_subset_HE_t5x6_v3.2features_SIGFILES.zip:
    #    Contains 1800 SIG files, plus 4 FOF files (items 2-5 below):
    #       "lymphoma_iicbu2008_subset_EOSIN_ONLY_images.fof.tsv"
    #       "lymphoma_iicbu2008_subset_EOSIN_ONLY_sigfiles_t5x6-l.fof.tsv"
    #       "lymphoma_iicbu2008_subset_2CHAN_HE_images.fof.tsv"
    #       "lymphoma_iicbu2008_subset_2CHAN_HE_sigfiles_t5x6-l.fof.tsv"

    # List of possible feature sources:
    # 1. Single channel FIT (Eosin only)
    # 2. Single channel FOF (Eosin only) referencing to 30 tiffs (requires global sampling options -t5x6 -l to grab sigs)
    # 3. Single channel FOF (Eosin only) referencing 900 sig files
    # 4. Double channel FOF (Eosin+Haemotoxylin) referencing 60 tiffs (requires global sampling options -t5x6 -l to grab sigs)
    # 5. Double channel FOF (Eosin+Haemotoxylin) referencing 1800 sig files.

    #=============================================
    # BEGIN CODE TO CREATE TESTDATA ZIP PACKAGE
    # (kept, commented out, as the record of how the fixture archive was built)

    #import zipfile
    #import zlib
    #path = '/Users/chris/src/wnd-charm/tests/pywndcharm_tests/TESTDATA_lymphoma_iicbu2008_subset_HE_t5x6_v3.2features_SIGFILES.zip'
    #zf = zipfile.ZipFile( path, mode='w' )
    #import os
    #classes = 'CLL', 'FL', 'MCL',
    #channels = 'haemotoxylin', 'eosin'
    #from collections import defaultdict
    #sig_tracker = defaultdict(int)
    #samplegroupid_tracker = {}
    #samplegroup_counter = 0
    #
    #eosin_tif_fof = [] # 30 lines
    #eosin_sig_fof = [] # 900 lines
    #double_tif_fof = [] # 30 lines, 2 feature set columns
    #double_sig_fof = [] # 900 lines, 2 feature set columns
    #
    #for _channel in channels:
    #    zf.write( './' + _channel, compress_type=zipfile.ZIP_DEFLATED )
    #    for _class in classes:
    #        zf.write( './' + _channel + '/' + _class, compress_type=zipfile.ZIP_DEFLATED )
    #        for root, dirs, files in os.walk( _channel + '/' + _class ):
    #            for _file in files:
    #                if _file.endswith( '.tif' ):
    #                    # Strip off the _H.tif or _E.tif
    #                    samplename = _file[:-6]
    #                    eosinpath = './eosin/' + _class + '/' + samplename + '_E.tif'
    #                    haemopath = './haemotoxylin/' + _class + '/' + samplename + '_H.tif'
    #                    if _channel == 'eosin':
    #                        eosin_tif_fof.append( eosinpath + '\t' + _class )
    #                        double_tif_fof.append( samplename + '\t' + _class + '\t' + eosinpath + '\t{\tchannel\t=\teosin\t}\t' + haemopath + '\t{\tchannel\t=\thaemotoxylin\t}')
    #                elif _file.endswith( '.sig' ):
    #                    zf.write( './' + _channel + '/' + _class + '/' + _file, compress_type=zipfile.ZIP_DEFLATED )
    #                    if _channel == 'eosin':
    #                        # Strip off the _H-t5x6_0_0-l.sig
    #                        samplename = _file[:-17] + '.tif'
    #                        eosinpath = './eosin/' + _class + '/' + _file
    #                        haemopath = './haemotoxylin/' + _class + '/' + _file.replace( '_E-t5x6_', '_H-t5x6_' )
    #                        # count samples from 0:
    #                        samplesequenceid = str( sig_tracker[ samplename ] )
    #                        sig_tracker[ samplename ] += 1
    #                        if samplename not in samplegroupid_tracker:
    #                            samplegroupid_tracker[ samplename ] = samplegroup_counter
    #                            samplegroup_counter += 1
    #                        samplegroupid = str( samplegroupid_tracker[ samplename ] )
    #                        eosin_sig_fof.append( eosinpath + '\t' + _class )
    #                        double_sig_fof.append( samplename + '\t' + _class + '\t' + eosinpath + '\t{\tchannel\t=\teosin\t;\tsamplegroupid\t=\t' + samplegroupid + '\t;\tsamplesequenceid\t=\t' + samplesequenceid + '\t}\t' + haemopath + '\t{\tchannel\t=\thaemotoxylin\t;\tsamplegroupid\t=\t' + samplegroupid + '\t;\tsamplesequenceid\t=\t' + samplesequenceid + '\t}\t')
    #
    #fof_dir = '/Users/chris/src/wnd-charm/tests/pywndcharm_tests/'
    #with open( 'lymphoma_iicbu2008_subset_EOSIN_ONLY_images.fof.tsv', 'w') as out:
    #    for _ in eosin_tif_fof:
    #        out.write( _ + '\n')
    #with open( 'lymphoma_iicbu2008_subset_EOSIN_ONLY_sigfiles_t5x6-l.fof.tsv', 'w') as out:
    #    for _ in eosin_sig_fof:
    #        out.write( _ + '\n')
    #with open( 'lymphoma_iicbu2008_subset_2CHAN_HE_images.fof.tsv', 'w') as out:
    #    for _ in double_tif_fof:
    #        out.write( _ + '\n')
    #with open( 'lymphoma_iicbu2008_subset_2CHAN_HE_sigfiles_t5x6-l.fof.tsv', 'w') as out:
    #    for _ in double_sig_fof:
    #        out.write( _ + '\n')
    #zf.write( './' + 'lymphoma_iicbu2008_subset_EOSIN_ONLY_images.fof.tsv', compress_type=zipfile.ZIP_DEFLATED )
    #zf.write( './' + 'lymphoma_iicbu2008_subset_EOSIN_ONLY_sigfiles_t5x6-l.fof.tsv', compress_type=zipfile.ZIP_DEFLATED )
    #zf.write( './' + 'lymphoma_iicbu2008_subset_2CHAN_HE_images.fof.tsv', compress_type=zipfile.ZIP_DEFLATED )
    #zf.write( './' + 'lymphoma_iicbu2008_subset_2CHAN_HE_sigfiles_t5x6-l.fof.tsv', compress_type=zipfile.ZIP_DEFLATED )
    #zf.printdir()
    #zf.close()

    # END CODE TO CREATE TESTDATA ZIP PACKAGE
    #=============================================

    import zipfile
    from random import Random

    def assert_rows_match( reference_fs, candidate_fs ):
        """Fail on the first differing row, naming the row and both samples."""
        row_pairs = zip( reference_fs.data_matrix, candidate_fs.data_matrix )
        for row_num, (fit_row, fof_row) in enumerate( row_pairs ):
            if not compare( fit_row, fof_row ):
                self.fail( "error in sample row {0}: FIT: {1} FOF: {2}".format(
                    row_num,
                    reference_fs._contiguous_sample_names[row_num],
                    candidate_fs._contiguous_sample_names[row_num] ) )

    sigfiles_zip_path = pychrm_test_dir + sep + 'lymphoma_iicbu2008_subset_HE_t5x6_v3.2features_SIGFILES.zip'
    fit_zip_path = pychrm_test_dir + sep + 'lymphoma_iicbu2008_subset_EOSIN_ONLY_t5x6_v3.2features.fit.zip'

    tempdir = mkdtemp()
    try:
        # Inflate both archives into the temp dir. Doing this inside the try
        # and closing each archive keeps handles and the directory from
        # leaking if extraction fails.
        with zipfile.ZipFile( sigfiles_zip_path, mode='r' ) as zf1:
            zf1.extractall( tempdir )
        # for comparison:
        with zipfile.ZipFile( fit_zip_path, mode='r' ) as zf2:
            zf2.extractall( tempdir )

        sorted_fof = tempdir + sep + \
            'lymphoma_iicbu2008_subset_EOSIN_ONLY_sigfiles_t5x6-l.fof.tsv'
        fit_path = tempdir + sep + 'lymphoma_iicbu2008_subset_eosin_t5x6_v3.2features.fit'

        # sampling opts: -l -t5x6 implies 5 columns and 6 rows ... I know it's weird.
        sampling_opts = {
            'quiet': True,
            'long': True,
            'tile_num_rows': 6,
            'tile_num_cols': 5,
        }

        fs_fof = FeatureSpace.NewFromFileOfFiles( pathname=sorted_fof, **sampling_opts )
        fs_fit = FeatureSpace.NewFromFitFile( pathname=fit_path, **sampling_opts )

        # Fit file has fewer significant figures than Signature files, and it's
        # not consistent how many there are. Seems like fit file just lops off
        # numbers at the end. Example: (signatures on top, fit on bottom)
        #
        # -  17.232246, # sig
        # ?         --
        # +  17.2322,   # fit
        # -  -63.549056, # sig
        # ?        ^^^
        # +  -63.5491,   # fit
        # ?        ^
        # -  223.786977, # sig
        # ?        ---
        # +  223.787,    # fit
        #
        # More of the same:
        #(Pdb) fs_fof.data_matrix[0,-5:]
        #array([   0.935442,   14.005003,  -43.562076,  127.394914,    0.628772])
        #(Pdb) fs_fit.data_matrix[0,-5:]
        #array([   0.935442,   14.005   ,  -43.5621  ,  127.395   ,    0.628772])
        #
        # Hence the row-wise compare() rather than a strict array comparison
        # (assert_allclose default is rtol=1e-07, atol=0).
        assert_rows_match( fs_fit, fs_fof )

        # Test sorting; scramble the FOF then load and check.
        with open( sorted_fof ) as fof:
            lines = fof.readlines()
        # Fixed seed so that any failure can be reproduced.
        Random( 42 ).shuffle( lines )

        unsorted_fof = tempdir + sep + \
            'lymphoma_iicbu2008_subset_EOSIN_ONLY_sigfiles_t5x6-l_UNSORTED.fof.tsv'
        with open( unsorted_fof, 'w' ) as fof:
            fof.writelines( lines )

        fs_fof = FeatureSpace.NewFromFileOfFiles( pathname=unsorted_fof, **sampling_opts )
        # Check again
        assert_rows_match( fs_fit, fs_fof )

        # TESTING TAKE TILES:
        self.assertRaises( ValueError, fs_fof.TakeTiles, tuple() )
        self.assertRaises( ValueError, fs_fof.TakeTiles, (45, 46, 47,) )
        self.assertRaises( TypeError, fs_fof.TakeTiles, 'crap' )

        # take middle 4
        wanted_tiles = ( 14, 15, 20, 21 )
        took = fs_fof.TakeTiles( wanted_tiles, inplace=False )
        num_sample_groups = len( set( fs_fof._contiguous_sample_group_ids ) )
        self.assertEqual( took.num_samples_per_group, len( wanted_tiles ) )
        self.assertEqual( took.num_samples, len( wanted_tiles ) * num_sample_groups )

        # Disabled follow-up check: load the middle four tiles via their own FOF
        # and compare with the TakeTiles result.
        #
        # mid4 = 'lymphoma_iicbu2008_subset_EOSIN_ONLY_sigfiles_MIDDLE_4_TILES_t5x6-l.fof.tsv'
        # # fake out wndcharm by putting empty tiffs in the temp dir
        # # we don't need them, the sigs are in there already.
        # with open( mid4) as fof:
        #     lines = fof.readlines()
        # names, classes, paths, opts = zip( *[ _.split('\t') for _ in lines ] )
        # for _path in paths:
        #     with open( tempdir + sep + _path, 'w' ):
        #         pass
        # took_via_fof = FeatureSpace.NewFromFileOfFiles( mid4, num_samples_per_group=4 )
        #
        # for row_num, (fit_row, fof_row) in enumerate( zip( took.data_matrix, took_via_fof.data_matrix )):
        #     retval = compare( fit_row, fof_row )
        #     if retval == False:
        #         print "error in sample row", row_num
        #         print "FIT: ", took._contiguous_sample_names[row_num], "FOF", took_via_fof._contiguous_sample_names[row_num]
        #     self.assertTrue( retval )
    finally:
        rmtree( tempdir )
def test_FitOnFit(self):
    """Uses a curated subset of the IICBU 2008 Lymphoma dataset, preprocessed as follows:
    auto-deconvolved, eosin channel only, tiled 5x6, 3 classes, 10 imgs per class,
    300 samples per class.

    Fit-on-fit WND5 classification is run twice on the same feature space,
    first treating every sample independently and then with samples grouped
    into tiles of 30. Each time the marginal probabilities are compared, to an
    absolute tolerance of 0.003, with those parsed from a stored HTML report.
    """
    import zipfile
    from numpy.testing import assert_allclose

    def flatten_marg_probs(results, round_to_report_precision=False, n_classes=3):
        """Pack each result's marginal probabilities into one flat array.

        With round_to_report_precision, values are first rounded to 3 decimal
        places, because the HTML report only has 3 decimal places.
        """
        flat = np.empty((n_classes * len(results)))
        for i, single_result in enumerate(results):
            probs = single_result.marginal_probabilities
            if round_to_report_precision:
                probs = [float("{0:0.3f}".format(val)) for val in probs]
            flat[i * n_classes:(i + 1) * n_classes] = probs
        return flat

    zipped_file_path = pychrm_test_dir + sep + 'lymphoma_iicbu2008_subset_EOSIN_ONLY_t5x6_v3.2features.fit.zip'

    tempdir = mkdtemp()
    try:
        # Inflate the zipped test fit into the temp dir. Doing this inside the
        # try and closing the archive keeps the handle and the directory from
        # leaking if extraction fails.
        with zipfile.ZipFile(zipped_file_path, mode='r') as zf:
            zf.extractall(tempdir)
            fitfilepath = tempdir + sep + zf.namelist()[0]

        # Do fit on fit WITHOUT tiling and compare with fit on fit results
        # generated with wndchrm 1.60
        fs = FeatureSpace.NewFromFitFile(fitfilepath).Normalize(
            inplace=True, quiet=True)
        #fs = FeatureSpace.NewFromFitFile( wndchrm_test_dir + sep + 'test-l.fit' )
        #fs.ToFitFile( 'temp.fit' )
        fw = FisherFeatureWeights.NewFromFeatureSpace(fs).Threshold()
        fs.FeatureReduce(fw, inplace=True)
        # #fw.Print()
        # #fs.Print(verbose=True)
        pychrm_res = FeatureSpaceClassification.NewWND5(fs, fs, fw)
        pychrm_res.Print()

        # Disabled profiling snippet:
        # import cProfile as pr
        # #import profile as pr
        # import tempfile
        # import pstats
        # prof = tempfile.NamedTemporaryFile()
        # cmd = 'no_tile_pychrm_result = DiscreteBatchClassificationResult.New( reduced_fs, reduced_fs, fw )'
        # pr.runctx( cmd, globals(), locals(), prof.name)
        # p = pstats.Stats(prof.name)
        # p.sort_stats('time').print_stats(20)
        # prof.close()

        self.maxDiff = None

        html_path = pychrm_test_dir + sep + 'lymphoma_iicbu2008_subset_eosin_t5x6_v3.2feats_REFERENCE_RESULTS_900_samples_TRAINING_ERROR.html'
        wres = FeatureSpaceClassificationExperiment.NewFromHTMLReport(html_path)
        wres.Print()
        wc_batch_result = wres.individual_results[0]  # only 1 split in fit-on-fit

        # This takes WAY too long:
        #self.assertSequenceEqual( wc_batch_result.individual_results, pychrm_res.individual_results )
        # so compare the flattened marginal probabilities instead.
        wc_result = flatten_marg_probs(wc_batch_result.individual_results)
        pc_result = flatten_marg_probs(
            pychrm_res.individual_results, round_to_report_precision=True)
        assert_allclose(actual=pc_result, desired=wc_result, atol=0.003)

        #wc_batch_result.Print()
        #pres.Print()

        # ==========================================================
        # Now do the same with tiling, reusing fs from before:

        num_samples_per_group = 30
        # Floor division: the group count feeds range() and must be an int.
        n_groups = fs.num_samples // num_samples_per_group

        # One group id per sample, repeated num_samples_per_group times each.
        new_sg_ids = [i for i in range(n_groups)
                      for _ in range(num_samples_per_group)]
        fs.Update(tile_num_rows=5, tile_num_cols=6,
                  num_samples_per_group=num_samples_per_group,
                  _contiguous_sample_group_ids=new_sg_ids)._RebuildViews()

        with_tile_pychrm_result = FeatureSpaceClassification.NewWND5(fs, fs, fw)

        html_path = pychrm_test_dir + sep + 'lymphoma_iicbu2008_subset_eosin_t5x6_v3.2feats_REFERENCE_RESULTS_30_samples_tiled_TRAINING_ERROR.html'
        with_tile_wndchrm_result = \
            FeatureSpaceClassificationExperiment.NewFromHTMLReport(html_path).individual_results[0]

        #self.assertSequenceEqual( with_tile_pychrm_result.averaged_results, with_tile_wndchrm_result.individual_results )
        wc_result = flatten_marg_probs(with_tile_wndchrm_result.individual_results)
        pc_result = flatten_marg_probs(
            with_tile_pychrm_result.averaged_results, round_to_report_precision=True)
        assert_allclose(actual=pc_result, desired=wc_result, atol=0.003)
    finally:
        rmtree(tempdir)
def test_ParallelTiling(self):
    """Specify bounding box to FeatureVector, calc features, then compare
    with C++ implementation-calculated feats.

    The reference directory holds the image plus the pre-computed reference
    .sig files from the zip; the target directory holds only the image. Each
    gets a one-line file-of-files pointing at its copy of the image. Both are
    loaded with the same 6x5 tiling options, and the two data matrices are
    compared row by row.
    """
    import zipfile
    from shutil import copy
    from tempfile import NamedTemporaryFile
    from wndcharm.utils import compare

    reference_feats = pychrm_test_dir + sep + 'lymphoma_eosin_channel_MCL_test_img_sj-05-3362-R2_001_E_t6x5_REFERENCE_SIGFILES.zip'
    img_filename = "lymphoma_eosin_channel_MCL_test_img_sj-05-3362-R2_001_E.tif"
    orig_img_filepath = pychrm_test_dir + sep + img_filename

    # Nested try/finally so each temp dir is removed independently: a failure
    # creating or removing one must not leave the other behind.
    refdir = mkdtemp(prefix='ref')
    try:
        targetdir = mkdtemp(prefix='target')
        try:
            with zipfile.ZipFile(reference_feats, mode='r') as zf:
                zf.extractall(refdir)

            # copy the tiff to the tempdir so the .sig files end up there too
            copy(orig_img_filepath, targetdir)
            copy(orig_img_filepath, refdir)

            with NamedTemporaryFile(mode='w', dir=refdir, prefix='ref',
                                    delete=False) as temp:
                ref_fof = temp.name
                temp.write('reference_samp\ttest_class\t{}\t{{}}\n'.format(
                    refdir + sep + img_filename))

            with NamedTemporaryFile(mode='w', dir=targetdir, prefix='target',
                                    delete=False) as temp:
                target_fof = temp.name
                temp.write('test_samp\ttest_class\t{}\t{{}}\n'.format(
                    targetdir + sep + img_filename))

            global_sampling_options = \
                FeatureVector(long=True, tile_num_cols=6, tile_num_rows=5)

            # Should just load reference sigs
            ref_fs = FeatureSpace.NewFromFileOfFiles(
                ref_fof, quiet=False,
                global_sampling_options=global_sampling_options)
            # NOTE(review): n_jobs=True presumably enables parallel feature
            # calculation -- confirm against NewFromFileOfFiles.
            target_fs = FeatureSpace.NewFromFileOfFiles(
                target_fof, n_jobs=True, quiet=False,
                global_sampling_options=global_sampling_options)

            #from numpy.testing import assert_allclose
            #self.assertTrue( assert_allclose( ref_fs.data_matrix, target_fs.data_matrix ) )

            row_pairs = zip(ref_fs.data_matrix, target_fs.data_matrix)
            for row_num, (ref_row, test_row) in enumerate(row_pairs):
                if not compare(ref_row, test_row):
                    self.fail(
                        "error in sample row {0}: REF: {1} TARGET: {2}".format(
                            row_num,
                            ref_fs._contiguous_sample_names[row_num],
                            target_fs._contiguous_sample_names[row_num]))
        finally:
            rmtree(targetdir)
    finally:
        rmtree(refdir)
parser.add_argument(
    'output_filepath',
    help='Results are written to this file, otherwise to STDOUT',
    nargs='?')
parser.add_argument(
    '-D',
    help='Write the training set to a pickle file',
    metavar='<optional path>',
    default='unset',
    nargs='?')

args = parser.parse_args()

num_splits = args.n
num_bins = args.b
input_filename = args.classifier_file_path[0]
outpath = args.output_filepath
dump_pickle = args.D

# Pick the loader from the input file's extension.
if input_filename.endswith( ".fit" ):
    full_set = FeatureSpace.NewFromFitFile( input_filename )
elif input_filename.endswith( ".fit.pickled" ):
    full_set = FeatureSpace.NewFromPickleFile( input_filename )
elif input_filename.endswith( ".fof" ):
    full_set = FeatureSpace.NewFromFileOfFiles( input_filename )
else:
    # ValueError (still an Exception) because the argument's value is unusable.
    raise ValueError( 'The classifier must either end in .fit, .fit.pickled, or .fof' )

# -D has three states: absent (the 'unset' default), given with a path, or
# given bare (argparse stores None for a bare nargs='?' option with no const).
if dump_pickle != 'unset':
    if dump_pickle:
        # user used -D to specify a name for their training set pickle
        full_set.PickleMe( dump_pickle )
    else:
        # user used -D as a flag, use default pickle name pattern
        full_set.PickleMe()
def test_TileOptions(self):
    """Load test-l.fit, passing ``tile_options`` as the second positional argument.

    No assertions are made on the result here.
    """
    # NOTE(review): `tile_options` is not defined in this function; presumably
    # it is a module-level name, or this test is unfinished -- confirm.
    fs = FeatureSpace.NewFromFitFile(wndchrm_test_dir + sep + 'test-l.fit', tile_options)