예제 #1
0
    def test_Normalize(self):
        """Normalize a freshly loaded feature space and check it against
        the stored, already-normalized reference feature space."""

        from wndcharm.utils import compare
        from numpy.testing import assert_allclose

        # Load the raw features, then normalize them in place.
        loaded = FeatureSpace.NewFromFitFile(self.test_fit_path)
        normalized = loaded.Normalize(inplace=True)

        # Reference values come from the pre-normalized fit file.
        reference = FeatureSpace.NewFromFitFile(self.test_normalized_fit_path)

        assert_allclose(
            normalized.data_matrix, reference.data_matrix, rtol=1e-05)
예제 #2
0
    def test_LDATransform( self ):
        """Smoke-test LDATransform followed by WND5 classification.

        Two scenarios are exercised on the eosin-only lymphoma fit file:
        transforming a feature space against itself, and transforming a
        train/test split where the test set uses the train set as reference.
        No result values are asserted; the test passes if nothing raises.
        """

        import zipfile

        tempdir = mkdtemp()
        try:
            zip_path = pychrm_test_dir + sep + 'lymphoma_iicbu2008_subset_EOSIN_ONLY_t5x6_v3.2features.fit.zip'
            # Extract inside the try so the temp dir is removed even if
            # extraction fails, and close the archive handle right away.
            with zipfile.ZipFile( zip_path, mode='r' ) as zf:
                zf.extractall( tempdir )
            fitfile_path = tempdir + sep + 'lymphoma_iicbu2008_subset_eosin_t5x6_v3.2features.fit'

            kwargs = {
                'pathname': fitfile_path,
                'quiet': True,
                # sampling opts: -l -t5x6 implies 5 columns and 6 rows ... I know it's weird.
                'long': True,
                'tile_num_rows': 6,
                'tile_num_cols': 5,
            }

            # against self:
            fs = FeatureSpace.NewFromFitFile( **kwargs )
            self_transformed = fs.LDATransform( reference_features=None, inplace=False )
            FeatureSpaceClassification.NewWND5(
                    self_transformed, self_transformed, feature_weights=None )

            # against other:
            train, test = fs.Split()
            train.LDATransform( reference_features=None, inplace=True )
            test.LDATransform( reference_features=train, inplace=True )
            FeatureSpaceClassification.NewWND5(
                    train, test, feature_weights=None )
        finally:
            rmtree( tempdir )
예제 #3
0
    def test_DiscreteTrainTestSplitWithTiling( self ):
        """Smoke-test a train/test split, normalization and feature reduction on tiled data.

        Uses a curated subset of the IICBU 2008 Lymphoma dataset, preprocessed as follows:
        auto-deconvolved, eosin channel only, tiled 5x6, 3 classes, 10 imgs per class,
        300 samples per class.

        No result values are asserted; the test passes if nothing raises.
        """

        import zipfile
        from numpy.random import RandomState

        zipped_file_path = pychrm_test_dir + sep + 'lymphoma_iicbu2008_subset_EOSIN_ONLY_t5x6_v3.2features.fit.zip'

        tempdir = mkdtemp()
        try:
            # Inflate the zipped test fit into the temp dir. Doing this inside
            # the try guarantees cleanup if extraction fails; the context
            # manager closes the archive handle.
            with zipfile.ZipFile( zipped_file_path, mode='r' ) as zf:
                zf.extractall( tempdir )
                fitfilepath = tempdir + sep + zf.namelist()[0]

            # NOTE(review): other tests in this suite load the same data with
            # tile_num_rows=6, tile_num_cols=5 -- confirm which orientation is intended.
            fs = FeatureSpace.NewFromFitFile( fitfilepath, tile_num_rows=5, tile_num_cols=6 )

            # Fixed seed so the split is reproducible.
            prng = RandomState(42)
            train, test = fs.Split( random_state=prng, quiet=True )
            train.Normalize( inplace=True, quiet=True )
            fw = FisherFeatureWeights.NewFromFeatureSpace( train ).Threshold()
            train.FeatureReduce( fw, inplace=True )
            # The test set is reduced to the same features, then normalized
            # with the train set passed as reference.
            test.FeatureReduce( fw, inplace=True ).Normalize( train, inplace=True, quiet=True )

        finally:
            rmtree( tempdir )
예제 #4
0
    def test_NewFromFeatureSet(self):
        """Fisher score calculation, plus the slice operator on the weights.

        Fisher weights computed from the normalized test fit file are compared
        value-by-value against weights stored on disk, then plain and stepped
        slices of the weights object are checked against direct indexing.
        """

        fs = FeatureSpace.NewFromFitFile(
            self.test_fit_path).Normalize(inplace=True)
        fw = FisherFeatureWeights.NewFromFeatureSpace(fs)

        # test weights generated from test-l.fit:
        # wndchrm classify -l -f1.0 -vtest_fit-l.weights test-l.fit test-l.fit
        target_fw = FisherFeatureWeights.NewFromFile(
            self.test_feat_weight_path)

        for target_val, res_val in zip(target_fw.values, fw.values):
            self.assertAlmostEqual(target_val, res_val, delta=self.epsilon)

        # test slice operator

        sliced = fw[:10]
        self.assertEqual(len(sliced), 10)
        self.assertEqual(len(sliced.feature_names), 10)
        self.assertEqual(len(sliced.values), 10)

        for i in range(10):
            self.assertEqual(sliced.feature_names[i], fw.feature_names[i])
            self.assertEqual(sliced.values[i], fw.values[i])

        # Stepped slice. Derive the expected source indices from the slice
        # itself, clamped to len(fw), and check the length explicitly so a
        # short or empty result cannot pass vacuously.
        expected_indices = range(*slice(50, 100, 2).indices(len(fw)))
        sliced = fw[50:100:2]
        self.assertEqual(len(sliced), len(expected_indices))

        for i, j in enumerate(expected_indices):
            self.assertEqual(sliced.feature_names[i], fw.feature_names[j])
            self.assertEqual(sliced.values[i], fw.values[j])
예제 #5
0
    def test_ContinuousFitOnFit(self):
        """Smoke-test multivariate linear regression on an artificial feature space.

        An artificial discrete feature space is written to a fit file, read
        back with discrete=False, normalized, reduced with thresholded Pearson
        weights and fed to NewMultivariateLinear. Nothing is asserted on the
        result; the test passes if nothing raises.
        """
        from wndcharm.ArtificialFeatureSpace import CreateArtificialFeatureSpace_Discrete

        artificial_opts = dict(
            n_samples=1000,
            n_classes=10,
            num_features_per_signal_type=30,
            noise_gradient=5,
            initial_noise_sigma=10,
            n_samples_per_group=1,
            interpolatable=True,
        )
        discrete_space = CreateArtificialFeatureSpace_Discrete(**artificial_opts)

        scratch_dir = mkdtemp()
        fit_path = scratch_dir + sep + 'Artificial.fit'

        try:
            # Round-trip through a fit file to obtain the continuous variant.
            discrete_space.ToFitFile(fit_path)
            continuous_space = FeatureSpace.NewFromFitFile(
                fit_path, discrete=False)

            continuous_space.Normalize(quiet=True)
            all_weights = PearsonFeatureWeights.NewFromFeatureSpace(
                continuous_space)
            kept_weights = all_weights.Threshold()
            reduced_space = continuous_space.FeatureReduce(kept_weights)
            FeatureSpaceRegression.NewMultivariateLinear(
                reduced_space, kept_weights, quiet=True)
        finally:
            rmtree(scratch_dir)
예제 #6
0
    def test_SamplesUnion(self):
        """SamplesUnion must raise ValueError when given the test-l.fit
        feature space, while adding two identically configured artificial
        spaces must keep the class count unchanged."""
        from wndcharm.ArtificialFeatureSpace import CreateArtificialFeatureSpace_Discrete

        n_classes = 2

        def make_artificial():
            # Both artificial spaces are built with exactly the same options.
            return CreateArtificialFeatureSpace_Discrete(
                n_samples=20,
                n_classes=n_classes,
                num_features_per_signal_type=30,
                noise_gradient=5,
                initial_noise_sigma=10,
                n_samples_per_group=1,
                interpolatable=True)

        first = make_artificial()

        from_fit = FeatureSpace.NewFromFitFile(
            wndchrm_test_dir + sep + 'test-l.fit')

        with self.assertRaises(ValueError):
            first.SamplesUnion(other_fs=from_fit)

        second = make_artificial()
        combined = first + second

        self.assertEqual(n_classes, combined.num_classes)
    def test_FitOnFitClassification(self):
        """Compare Python-API fit-on-fit WND5 results with a stored HTML report.

        test-l.fit is normalized, reduced to its top 438 Fisher-weighted
        features and classified against itself; each per-sample result is then
        compared with the matching entry parsed from the reference HTML report.
        """
        from wndcharm.FeatureSpacePredictionExperiment import FeatureSpaceClassificationExperiment

        fitfile_path = wndchrm_test_dir + sep + 'test-l.fit'
        fs = FeatureSpace.NewFromFitFile(fitfile_path)
        fs.Normalize(inplace=True, quiet=True)
        fw = FisherFeatureWeights.NewFromFeatureSpace(fs).Threshold(438)
        fw.Print(50)
        fs.FeatureReduce(fw, inplace=True)
        pychrm_split = FeatureSpaceClassification.NewWND5(fs,
                                                          fs,
                                                          fw,
                                                          quiet=False)

        html_path = pychrm_test_dir + sep + 'test-l_training_error_result.html'
        html_exp = FeatureSpaceClassificationExperiment.NewFromHTMLReport(
            html_path, quiet=False)
        # single split in this html
        html_split = html_exp.individual_results[0]

        result_pairs = zip(html_split.individual_results,
                           pychrm_split.individual_results)
        for i, (html_result, pychrm_result) in enumerate(result_pairs):
            # Same "not ==" test that assertEqual applies. The diagnostic is
            # built only on mismatch and is actually reported: previously it
            # was assembled inside a bare except and then thrown away.
            if not html_result == pychrm_result:
                outstr = "Error in comparison # {0}:\n".format(i)
                outstr += "HTML result:\n{0}\n Python API res:\n{1}".format(
                    html_result, pychrm_result)
                self.fail(outstr)
예제 #8
0
    def test_Normalize( self ):
        """Check Normalize against a stored reference and against a z-score reference.

        First the default normalization of the raw fit file is compared with
        the stored normalized feature space. Then Normalize( zscore=True ) is
        compared with scipy's zscore of the same data matrix, with NaNs in the
        reference set to 0.
        """

        from numpy.testing import assert_allclose

        raw_fs = FeatureSpace.NewFromFitFile( self.test_fit_path )
        result_fs = raw_fs.Normalize( inplace=False )
        target_fs = FeatureSpace.NewFromFitFile( self.test_normalized_fit_path )

        assert_allclose( result_fs.data_matrix, target_fs.data_matrix, rtol=1e-05 )

        # Create reference Z-score feature space
        from scipy.stats.mstats import zscore
        from wndcharm.utils import ReplaceNonReal
        # Operates on the matrix in place (return value unused); presumably
        # substitutes non-finite entries -- TODO confirm against wndcharm.utils.
        ReplaceNonReal( raw_fs.data_matrix )
        # errstate restores the previous floating-point error settings even if
        # zscore raises, unlike a manual seterr/seterr pair.
        with np.errstate( all='ignore' ):
            zscore_target = zscore( raw_fs.data_matrix )
        zscore_target[ np.isnan( zscore_target ) ] = 0

        result_fs = raw_fs.Normalize( inplace=False, zscore=True )
        assert_allclose( result_fs.data_matrix, zscore_target )
예제 #9
0
    def test_WND5_all_features(self):
        """Classify one precalculated sample with WND5 at three feature counts
        and compare the rounded marginal probabilities with the values
        reported by the wndchrm command line tool."""
        tolerance = 0.00001

        # Reference inputs
        sig_path = join(test_dir, 't1_s01_c05_ij-l_precalculated.sig')
        fit_path = join(test_dir, 'test-l.fit')
        weights_path = join(test_dir, 'test_fit-l.weights')
        tif_path = join(test_dir, 't1_s01_c05_ij.tif')

        # Here are the correct values that Python API needs to return:
        # wndchrm classify -l -f0.75 test-l.fit t1_s01_c05_ij.tif
        # t1_s01_c05_ij.tif    1.6e-27    0.083    0.917    *    4cell    3.835
        # wndchrm classify -l test-l.fit t1_s01_c05_ij.tif
        # t1_s01_c05_ij.tif    3.19e-27    0.076    0.924    *    4cell    3.848
        # wndchrm classify -l -f0.05 test-l.fit t1_s01_c05_ij.tif
        # t1_s01_c05_ij.tif    1.06e-26    0.066    0.934    *    4cell    3.869

        # number of features used -> expected marginal probabilities
        expected_by_num_feats = {
            2189: [0.083, 0.917],
            438: [0.076, 0.924],
            146: [0.066, 0.934],
        }

        # Normalizing must leave the feature names untouched.
        training_set = FeatureSpace.NewFromFitFile(fit_path)
        names_before = training_set.feature_names
        training_set.Normalize()
        names_after = training_set.feature_names
        self.assertSequenceEqual(names_before, names_after)

        # The sample's features come from the .sig file and must line up
        # with the training set's feature names before normalization.
        sample_vector = FeatureVector(source_filepath=tif_path, long=True)
        sample_vector.LoadSigFile(sig_path)
        self.assertSequenceEqual(training_set.feature_names,
                                 sample_vector.feature_names)
        sample_vector.Normalize(training_set)

        full_weights = FisherFeatureWeights.NewFromFile(weights_path)

        for num_feats, expected in expected_by_num_feats.items():
            kept_weights = full_weights.Threshold(num_feats)
            reduced_set = training_set.FeatureReduce(kept_weights)
            reduced_sample = sample_vector.FeatureReduce(kept_weights)
            outcome = SingleSampleClassification.NewWND5(
                reduced_set, kept_weights, reduced_sample)
            # The command line tool reports 3 decimal places.
            rounded = [round(p, 3) for p in outcome.marginal_probabilities]
            for want, got in zip(expected, rounded):
                self.assertAlmostEqual(want, got, delta=tolerance)
예제 #10
0
    def test_NewFromFeatureSet(self):
        """Fisher scores computed from the normalized test fit file must
        match the weights stored on disk, within self.epsilon."""

        normalized_space = FeatureSpace.NewFromFitFile(self.test_fit_path)
        normalized_space.Normalize(inplace=True)
        computed = FisherFeatureWeights.NewFromFeatureSpace(normalized_space)

        # test weights generated from test-l.fit:
        # wndchrm classify -l -f1.0 -vtest_fit-l.weights test-l.fit test-l.fit
        reference = FisherFeatureWeights.NewFromFile(
            self.test_feat_weight_path)

        value_pairs = zip(reference.values, computed.values)
        for expected, actual in value_pairs:
            self.assertAlmostEqual(expected, actual, delta=self.epsilon)
예제 #11
0
    def test_DiscreteTrainTestSplitNoTiling( self ):
        """Smoke-test split, normalize, reduce and WND5 on the binucleate
        test set. Nothing is asserted; the test passes if nothing raises."""

        source_fs = FeatureSpace.NewFromFitFile( wndchrm_test_dir + sep + 'test-l.fit' )

        from numpy.random import RandomState
        # Fixed seed so the split is reproducible.
        seeded_rng = RandomState(42)
        train_all, test_all = source_fs.Split( random_state=seeded_rng, quiet=True )

        # Weights are derived from the normalized training half only.
        train_all.Normalize( quiet=True )
        all_weights = FisherFeatureWeights.NewFromFeatureSpace( train_all )
        kept_weights = all_weights.Threshold()
        train_kept = train_all.FeatureReduce( kept_weights )

        # The test half is reduced to the same features, then normalized
        # with the reduced training half passed as reference.
        test_kept = test_all.FeatureReduce( kept_weights )
        test_kept.Normalize( train_kept, quiet=True )

        FeatureSpaceClassification.NewWND5(
            train_kept, test_kept, kept_weights, quiet=True )
예제 #12
0
# Optional positional argument: where to write results.
parser.add_argument( 'output_filepath', help='Results are written to this file, otherwise to STDOUT',
                     nargs='?')
# -D takes an optional value. With nargs='?', argparse yields the default
# ('unset') when the flag is absent, None when the flag is given without a
# value, and the supplied string otherwise.
parser.add_argument( '-D', help='Write the training set to a pickle file', metavar='<optional path>',
                     default='unset', nargs='?')

args = parser.parse_args()


# Pull the parsed options into module-level names.
# NOTE(review): the -n, -b and classifier_file_path arguments are declared
# above this excerpt; their types and defaults are not visible here.
num_splits = args.n
num_bins = args.b
input_filename = args.classifier_file_path[0]
outpath = args.output_filepath
dump_pickle = args.D

# Pick the FeatureSpace loader from the input file's suffix.
if input_filename.endswith( ".fit" ):
    full_set = FeatureSpace.NewFromFitFile( input_filename )
elif input_filename.endswith( ".fit.pickled" ):
    full_set = FeatureSpace.NewFromPickleFile( input_filename )
elif input_filename.endswith( ".fof" ):
    full_set = FeatureSpace.NewFromFileOfFiles( input_filename )
else:
    raise Exception( 'The classifier must either end in .fit, .fit.pickled, or .fof' )


# Anything other than the 'unset' sentinel means -D appeared on the command line.
if not dump_pickle == 'unset':
    if dump_pickle:
        # user used -D to specify a name for their training set pickle
        full_set.PickleMe( dump_pickle )
    else:
        # user used -D as a flag, use default pickle name pattern
        full_set.PickleMe()
예제 #13
0
    def test_NewFromFileOfFiles( self ):
        """Pulls in the lymphoma eosin histology 5x6 tiled featureset via .sig files.

        A FeatureSpace loaded from a file-of-files (FOF) listing .sig files is
        compared row by row, via compare(), against the FeatureSpace loaded
        from the equivalent .fit file. The comparison is repeated after
        shuffling the lines of the FOF, and finally TakeTiles is exercised on
        the FOF-derived feature space.
        """

        # Types of files containing features:
        # FIT: contains an entire FeatureSpace definition including features.
        # FOF: "File Of Files" containing a FeatureSpace structure definition only,
        #      listing paths to files of pre-calculated features (.sig) or the
        #      tiff images themselves so features can be calculated
        # SIG: A text file containing pre-calculated features for a single sample.

        # Test dataset: subset of the IICBU2008 lymphoma dataset. 2 channels (H+E),
        #    3 classes ('CLL', 'FL', 'MCL'), 10 images per class per channel,
        #    5x6 tiling grid = 30 samples per image resulting in
        #    2 x 3 x 10 X 30 = 1800 total samples available

        # Files containing features included in this test suite:
        # 1. lymphoma_iicbu2008_subset_EOSIN_ONLY_t5x6_v3.2features.fit.zip:
        #    A zip archive containing a single FIT file with features pre-calculated.
        # 2. lymphoma_iicbu2008_subset_HE_t5x6_v3.2features_SIGFILES.zip:
        #    Contains 1800 SIG files, plus 4 FOF files (items 2-5 below):
        #       "lymphoma_iicbu2008_subset_EOSIN_ONLY_images.fof.tsv"
        #       "lymphoma_iicbu2008_subset_EOSIN_ONLY_sigfiles_t5x6-l.fof.tsv"
        #       "lymphoma_iicbu2008_subset_2CHAN_HE_images.fof.tsv"
        #       "lymphoma_iicbu2008_subset_2CHAN_HE_sigfiles_t5x6-l.fof.tsv"

        # List of possible feature sources:
        #    1. Single channel FIT (Eosin only)
        #    2. Single channel FOF (Eosin only) referencing to 30 tiffs (requires global sampling options -t5x6 -l to grab sigs)
        #    3. Single channel FOF (Eosin only) referencing 900 sig files
        #    4. Double channel FOF (Eosin+Haemotoxylin) referencing 60 tiffs (requires global sampling options -t5x6 -l to grab sigs)
        #    5. Double channel FOF (Eosin+Haemotoxylin) referencing 1800 sig files.

        #=============================================
        # BEGIN CODE TO CREATE TESTDATA ZIP PACKAGE
        # (kept for provenance only; not executed by the test)

        #import zipfile
        #import zlib
        #path = '/Users/chris/src/wnd-charm/tests/pywndcharm_tests/TESTDATA_lymphoma_iicbu2008_subset_HE_t5x6_v3.2features_SIGFILES.zip'
        #zf = zipfile.ZipFile( path, mode='w' )
        #import os
        #classes = 'CLL', 'FL', 'MCL',
        #channels = 'haemotoxylin', 'eosin'
        #from collections import defaultdict
        #sig_tracker = defaultdict(int)
        #samplegroupid_tracker = {}
        #samplegroup_counter = 0
        #
        #eosin_tif_fof = [] # 30 lines
        #eosin_sig_fof = [] # 900 lines
        #double_tif_fof = [] # 30 lines, 2 feature set columns
        #double_sig_fof = [] # 900 lines, 2 feature set columns
        #
        #for _channel in channels:
        #    zf.write( './' + _channel, compress_type=zipfile.ZIP_DEFLATED )
        #    for _class in classes:
        #        zf.write( './' + _channel + '/' + _class, compress_type=zipfile.ZIP_DEFLATED )
        #        for root, dirs, files in os.walk( _channel + '/' + _class ):
        #            for _file in files:
        #                if _file.endswith( '.tif' ):
        #                    # Strip off the _H.tif or _E.tif
        #                    samplename = _file[:-6]
        #                    eosinpath = './eosin/' + _class + '/' + samplename + '_E.tif'
        #                    haemopath = './haemotoxylin/' + _class + '/' + samplename + '_H.tif'
        #                    if _channel == 'eosin':
        #                        eosin_tif_fof.append( eosinpath + '\t' + _class )
        #                        double_tif_fof.append( samplename + '\t' + _class + '\t' + eosinpath + '\t{\tchannel\t=\teosin\t}\t' + haemopath + '\t{\tchannel\t=\thaemotoxylin\t}')
        #                elif _file.endswith( '.sig' ):
        #                    zf.write( './' + _channel + '/' + _class + '/' + _file, compress_type=zipfile.ZIP_DEFLATED )
        #                    if _channel == 'eosin':
        #                        # Strip off the _H-t5x6_0_0-l.sig
        #                        samplename = _file[:-17] + '.tif'
        #                        eosinpath = './eosin/' + _class + '/' + _file
        #                        haemopath = './haemotoxylin/' + _class + '/' + _file.replace( '_E-t5x6_', '_H-t5x6_' )
        #                        # count samples from 0:
        #                        samplesequenceid = str( sig_tracker[ samplename ] )
        #                        sig_tracker[ samplename ] += 1
        #                        if samplename not in samplegroupid_tracker:
        #                            samplegroupid_tracker[ samplename ] = samplegroup_counter
        #                            samplegroup_counter += 1
        #                        samplegroupid = str( samplegroupid_tracker[ samplename ] )
        #                        eosin_sig_fof.append( eosinpath + '\t' + _class )
        #                        double_sig_fof.append( samplename + '\t' + _class + '\t' + eosinpath + '\t{\tchannel\t=\teosin\t;\tsamplegroupid\t=\t' + samplegroupid + '\t;\tsamplesequenceid\t=\t' + samplesequenceid + '\t}\t' + haemopath + '\t{\tchannel\t=\thaemotoxylin\t;\tsamplegroupid\t=\t' + samplegroupid + '\t;\tsamplesequenceid\t=\t' + samplesequenceid + '\t}\t')
        #
        #fof_dir = '/Users/chris/src/wnd-charm/tests/pywndcharm_tests/'
        #with open( 'lymphoma_iicbu2008_subset_EOSIN_ONLY_images.fof.tsv', 'w') as out:
        #    for _ in eosin_tif_fof:
        #        out.write( _ + '\n')
        #with open( 'lymphoma_iicbu2008_subset_EOSIN_ONLY_sigfiles_t5x6-l.fof.tsv', 'w') as out:
        #    for _ in eosin_sig_fof:
        #        out.write( _ + '\n')
        #with open( 'lymphoma_iicbu2008_subset_2CHAN_HE_images.fof.tsv', 'w') as out:
        #    for _ in double_tif_fof:
        #        out.write( _ + '\n')
        #with open( 'lymphoma_iicbu2008_subset_2CHAN_HE_sigfiles_t5x6-l.fof.tsv', 'w') as out:
        #    for _ in double_sig_fof:
        #        out.write( _ + '\n')
        #zf.write( './' + 'lymphoma_iicbu2008_subset_EOSIN_ONLY_images.fof.tsv', compress_type=zipfile.ZIP_DEFLATED )
        #zf.write( './' + 'lymphoma_iicbu2008_subset_EOSIN_ONLY_sigfiles_t5x6-l.fof.tsv', compress_type=zipfile.ZIP_DEFLATED )
        #zf.write( './' + 'lymphoma_iicbu2008_subset_2CHAN_HE_images.fof.tsv', compress_type=zipfile.ZIP_DEFLATED )
        #zf.write( './' + 'lymphoma_iicbu2008_subset_2CHAN_HE_sigfiles_t5x6-l.fof.tsv', compress_type=zipfile.ZIP_DEFLATED )
        #zf.printdir()
        #zf.close()

        # END CODE TO CREATE TESTDATA ZIP PACKAGE
        #=============================================

        import zipfile
        from random import Random

        sig_zip_path = pychrm_test_dir + sep + 'lymphoma_iicbu2008_subset_HE_t5x6_v3.2features_SIGFILES.zip'
        # for comparison:
        fit_zip_path = pychrm_test_dir + sep + 'lymphoma_iicbu2008_subset_EOSIN_ONLY_t5x6_v3.2features.fit.zip'

        def assert_rows_match( fit_space, fof_space ):
            """Assert compare() accepts every pair of corresponding rows,
            printing the offending row number and sample names on a mismatch."""
            row_pairs = zip( fit_space.data_matrix, fof_space.data_matrix )
            for row_num, (fit_row, fof_row) in enumerate( row_pairs ):
                retval = compare( fit_row, fof_row )
                if not retval:
                    # Single-argument print calls behave the same as a
                    # statement (Python 2) and as a function (Python 3).
                    print( "error in sample row {0}".format( row_num ) )
                    print( "FIT:  {0} FOF {1}".format(
                        fit_space._contiguous_sample_names[row_num],
                        fof_space._contiguous_sample_names[row_num] ) )
                self.assertTrue( retval )

        tempdir = mkdtemp()
        try:
            # Inflate both archives into the temp dir. Doing this inside the
            # try guarantees cleanup if extraction fails; the context manager
            # closes each archive handle.
            for zip_path in ( sig_zip_path, fit_zip_path ):
                with zipfile.ZipFile( zip_path, mode='r' ) as zf:
                    zf.extractall( tempdir )

            # sampling opts: -l -t5x6 implies 5 columns and 6 rows ... I know it's weird.
            sampling_opts = {
                'quiet': True,
                'long': True,
                'tile_num_rows': 6,
                'tile_num_cols': 5,
            }

            sorted_fof = tempdir + sep + \
                    'lymphoma_iicbu2008_subset_EOSIN_ONLY_sigfiles_t5x6-l.fof.tsv'
            fs_fof = FeatureSpace.NewFromFileOfFiles( pathname=sorted_fof, **sampling_opts )

            fit_path = tempdir + sep + 'lymphoma_iicbu2008_subset_eosin_t5x6_v3.2features.fit'
            fs_fit = FeatureSpace.NewFromFitFile( pathname=fit_path, **sampling_opts )

            # Fit file has less significant figures than Signature files, and it's not
            # consistent how many there are. Seems like fit file just lops off numbers
            # at the end. Example: (signatures on top, fit on bottom)
            #
            # Example:
            # -  17.232246,  # sig
            # ?         --
            #
            # +  17.2322,    # fit
            # -  -63.549056, # sig
            # ?         ^^^
            #
            # +  -63.5491,   # fit
            # ?         ^
            #
            # -  223.786977, # sig
            # ?        ---
            #
            # +  223.787,    # fit

            # More of the same:
            #(Pdb) fs_fof.data_matrix[0,-5:]
            #array([   0.935442,   14.005003,  -43.562076,  127.394914,    0.628772])
            #(Pdb) fs_fit.data_matrix[0,-5:]
            #array([   0.935442,   14.005   ,  -43.5621  ,  127.395   ,    0.628772])

            # For that reason rows are checked with compare() rather than
            # assert_allclose (default rtol=1e-07, atol=0) or
            # assert_array_almost_equal_nulp.
            assert_rows_match( fs_fit, fs_fof )

            # Test sorting; scramble the FOF then load and check:
            with open( sorted_fof ) as fof:
                lines = fof.readlines()

            # Seeded so the scrambled order, and therefore the test, is reproducible.
            Random( 42 ).shuffle( lines )

            unsorted_fof = tempdir + sep + \
                    'lymphoma_iicbu2008_subset_EOSIN_ONLY_sigfiles_t5x6-l_UNSORTED.fof.tsv'

            with open( unsorted_fof, 'w' ) as fof:
                fof.writelines( lines )

            fs_fof = FeatureSpace.NewFromFileOfFiles( pathname=unsorted_fof, **sampling_opts )
            # Check again
            assert_rows_match( fs_fit, fs_fof )

            # TESTING TAKE TILES:
            self.assertRaises( ValueError, fs_fof.TakeTiles, tuple() )
            self.assertRaises( ValueError, fs_fof.TakeTiles, (45, 46, 47,) )
            self.assertRaises( TypeError, fs_fof.TakeTiles, 'crap' )

            # take middle 4
            wanted_tiles = ( 14, 15, 20, 21 )

            took = fs_fof.TakeTiles( wanted_tiles, inplace=False )
            num_sample_groups = len( set( fs_fof._contiguous_sample_group_ids ) )
            self.assertEqual( took.num_samples_per_group, len( wanted_tiles ) )
            self.assertEqual( took.num_samples, len( wanted_tiles ) * num_sample_groups )

            # Disabled check: compare the taken tiles against a FOF that lists
            # only the middle 4 tiles.
            #mid4 = 'lymphoma_iicbu2008_subset_EOSIN_ONLY_sigfiles_MIDDLE_4_TILES_t5x6-l.fof.tsv'
            ## fake out wndcharm by putting empty tiffs in the temp dir
            ## we don't need them, the sigs are in there already.
            #with open( mid4) as fof:
            #    lines = fof.readlines()
            #    names, classes, paths, opts = zip( *[ _.split('\t') for _ in lines ] )
            #    for _path in paths:
            #        with open( tempdir + sep + _path, 'w' ):
            #            pass
            #took_via_fof = FeatureSpace.NewFromFileOfFiles( mid4, num_samples_per_group=4 )
            #
            #for row_num, (fit_row, fof_row) in enumerate( zip( took.data_matrix, took_via_fof.data_matrix )):
            #    retval = compare( fit_row, fof_row )
            #    if retval == False:
            #        print "error in sample row", row_num
            #        print "FIT: ", took._contiguous_sample_names[row_num], "FOF", took_via_fof._contiguous_sample_names[row_num]
            #    self.assertTrue( retval )

        finally:
            rmtree( tempdir )
    def test_FitOnFit(self):
        """Compare fit-on-fit WND5 marginal probabilities with wndchrm HTML reports.

        Uses a curated subset of the IICBU 2008 Lymphoma dataset, preprocessed as follows:
        auto-deconvolved, eosin channel only, tiled 5x6, 3 classes, 10 imgs per class,
        300 samples per class.

        The comparison is done twice: once treating every sample individually,
        and once after regrouping the samples into tiled groups of 30 and
        using the per-group averaged results.
        """

        import zipfile
        from numpy.testing import assert_allclose

        def flatten_marg_probs(results, round_to_report=False):
            """Concatenate each result's marginal probabilities into one 1-D
            float array, optionally rounding to the 3 decimal places that the
            HTML report carries."""
            flat = []
            for single_result in results:
                for val in single_result.marginal_probabilities:
                    if round_to_report:
                        val = float("{0:0.3f}".format(val))
                    flat.append(val)
            return np.array(flat, dtype=float)

        zipped_file_path = pychrm_test_dir + sep + 'lymphoma_iicbu2008_subset_EOSIN_ONLY_t5x6_v3.2features.fit.zip'

        tempdir = mkdtemp()
        try:
            # Inflate the zipped test fit into the temp dir. Doing this inside
            # the try guarantees cleanup if extraction fails; the context
            # manager closes the archive handle.
            with zipfile.ZipFile(zipped_file_path, mode='r') as zf:
                zf.extractall(tempdir)
                fitfilepath = tempdir + sep + zf.namelist()[0]

            # Do fit on fit WITHOUT tiling and compare with fit on fit results
            # generated with wndchrm 1.60
            fs = FeatureSpace.NewFromFitFile(fitfilepath).Normalize(
                inplace=True, quiet=True)
            fw = FisherFeatureWeights.NewFromFeatureSpace(fs).Threshold()
            fs.FeatureReduce(fw, inplace=True)
            pychrm_res = FeatureSpaceClassification.NewWND5(fs, fs, fw)
            pychrm_res.Print()

            self.maxDiff = None

            html_path = pychrm_test_dir + sep + 'lymphoma_iicbu2008_subset_eosin_t5x6_v3.2feats_REFERENCE_RESULTS_900_samples_TRAINING_ERROR.html'
            wres = FeatureSpaceClassificationExperiment.NewFromHTMLReport(
                html_path)
            wres.Print()
            # only 1 split in fit-on-fit
            wc_batch_result = wres.individual_results[0]

            # Comparing the result objects directly with assertSequenceEqual
            # takes WAY too long, so compare flattened probability arrays.
            wc_result = flatten_marg_probs(wc_batch_result.individual_results)
            pc_result = flatten_marg_probs(
                pychrm_res.individual_results, round_to_report=True)

            assert_allclose(actual=pc_result, desired=wc_result, atol=0.003)

            # ==========================================================
            # Now do the same with tiling, reusing fs from before:

            num_samples_per_group = 30
            # Floor division keeps n_groups an int on Python 2 and Python 3.
            n_groups = fs.num_samples // num_samples_per_group
            # One group id per sample: each id repeated num_samples_per_group times.
            new_sg_ids = [
                i for i in range(n_groups)
                for _ in range(num_samples_per_group)
            ]
            fs.Update(tile_num_rows=5, tile_num_cols=6,
                      num_samples_per_group=num_samples_per_group,
                      _contiguous_sample_group_ids=new_sg_ids)._RebuildViews()
            with_tile_pychrm_result = FeatureSpaceClassification.NewWND5(
                fs, fs, fw)
            html_path = pychrm_test_dir + sep + 'lymphoma_iicbu2008_subset_eosin_t5x6_v3.2feats_REFERENCE_RESULTS_30_samples_tiled_TRAINING_ERROR.html'
            with_tile_wndchrm_result = \
                FeatureSpaceClassificationExperiment.NewFromHTMLReport(html_path).individual_results[0]

            # With tiling, the Python API's averaged (per-group) results are
            # compared with the report's individual results.
            wc_result = flatten_marg_probs(
                with_tile_wndchrm_result.individual_results)
            pc_result = flatten_marg_probs(
                with_tile_pychrm_result.averaged_results, round_to_report=True)

            assert_allclose(actual=pc_result, desired=wc_result, atol=0.003)

        finally:
            rmtree(tempdir)
예제 #15
0
    def test_TileOptions(self):

        fs = FeatureSpace.NewFromFitFile(wndchrm_test_dir + sep + 'test-l.fit',
                                         tile_options)