Exemplo n.º 1
0
    def test_ContinuousFitOnFit(self):
        """Write a discrete artificial feature space to a .fit file, read it
        back with ``discrete=False`` and run a multivariate linear regression.

        Nothing numeric is asserted; the test passes when no step raises.
        """
        from wndcharm.ArtificialFeatureSpace import CreateArtificialFeatureSpace_Discrete

        factory_args = dict(
            n_samples=1000,
            n_classes=10,
            num_features_per_signal_type=30,
            noise_gradient=5,
            initial_noise_sigma=10,
            n_samples_per_group=1,
            interpolatable=True)
        fs_discrete = CreateArtificialFeatureSpace_Discrete(**factory_args)

        scratch_dir = mkdtemp()
        fit_path = sep.join((scratch_dir, 'Artificial.fit'))

        try:
            fs_discrete.ToFitFile(fit_path)
            fs_continuous = FeatureSpace.NewFromFitFile(fit_path, discrete=False)
            fs_continuous.Normalize(quiet=True)

            # Weigh the features, threshold the weights, and keep only the
            # features that survive the threshold.
            all_weights = PearsonFeatureWeights.NewFromFeatureSpace(fs_continuous)
            fw_reduced = all_weights.Threshold()
            fs_reduced = fs_continuous.FeatureReduce(fw_reduced)

            FeatureSpaceRegression.NewMultivariateLinear(
                fs_reduced, fw_reduced, quiet=True)
        finally:
            # Remove the scratch directory even when a step above failed.
            rmtree(scratch_dir)
Exemplo n.º 2
0
    def test_RemoveClass( self ):
        """Remove one class by index and one by name, then verify the counts.

        Also checks that an unknown class name raises ValueError and that an
        out-of-range class index raises IndexError.
        """
        from wndcharm.ArtificialFeatureSpace import CreateArtificialFeatureSpace_Discrete

        n_classes = 10
        factory_args = dict( n_samples=1000, n_classes=n_classes,
                num_features_per_signal_type=30, noise_gradient=5,
                initial_noise_sigma=10, n_samples_per_group=1, interpolatable=True )
        fs = CreateArtificialFeatureSpace_Discrete( **factory_args )
        features_before = fs.num_features

        # First removal: address the class by its integer index.
        fs.RemoveClass( 5, inplace=True )
        self.assertEqual( fs.num_classes, n_classes - 1 )

        # Second removal: address a class by name. According to the original
        # author's note, this is the class that moved into index 5 after the
        # first removal, so in effect index 5 is removed twice.
        fs.RemoveClass( "FakeClass+033.3" , inplace=True )
        self.assertEqual( fs.num_classes, n_classes - 2 )

        # Each class has 100 samples, and two classes have been removed.
        samples_left = 800
        self.assertEqual( fs.num_samples, samples_left )
        self.assertEqual( fs.shape[0], samples_left )
        # The feature count is expected to be unchanged by class removal.
        self.assertEqual( fs.num_features, features_before )

        # Dummyproofing: bad name, then bad index.
        with self.assertRaises( ValueError ):
            fs.RemoveClass( class_token='trash' )
        with self.assertRaises( IndexError ):
            fs.RemoveClass( class_token=10 )
Exemplo n.º 3
0
    def test_SamplesUnion(self):
        """SamplesUnion with the reference fit-file feature space must raise
        ValueError; ``+`` on two identically configured artificial feature
        spaces must keep the class count."""
        from wndcharm.ArtificialFeatureSpace import CreateArtificialFeatureSpace_Discrete

        n_classes = 2
        # Both artificial feature spaces are built from the same settings.
        artificial_args = {
            'n_samples': 20,
            'n_classes': n_classes,
            'num_features_per_signal_type': 30,
            'noise_gradient': 5,
            'initial_noise_sigma': 10,
            'n_samples_per_group': 1,
            'interpolatable': True,
        }
        fs1 = CreateArtificialFeatureSpace_Discrete(**artificial_args)

        fs2 = FeatureSpace.NewFromFitFile(
            sep.join((wndchrm_test_dir, 'test-l.fit')))
        with self.assertRaises(ValueError):
            fs1.SamplesUnion(other_fs=fs2)

        fs3 = CreateArtificialFeatureSpace_Discrete(**artificial_args)
        joined_fs = fs1 + fs3

        self.assertEqual(n_classes, joined_fs.num_classes)
Exemplo n.º 4
0
    def test_SplitOptions( self ):
        """Exercise FeatureSpace.Split with a range of options.

        Covers the default split, ``test_size=0`` (single return value),
        rejection of bad ``train_size``/``test_size`` arguments, requests
        larger than a class can satisfy, and the default split of a feature
        space whose classes have unequal sizes.
        """
        from math import floor
        from wndcharm.ArtificialFeatureSpace import CreateArtificialFeatureSpace_Discrete

        fs_discrete = CreateArtificialFeatureSpace_Discrete( n_samples=1000, n_classes=10,
                num_features_per_signal_type=30, noise_gradient=5, initial_noise_sigma=10,
                n_samples_per_group=1, interpolatable=True, random_state=42)

        # default
        train_set, test_set = fs_discrete.Split( random_state=42, quiet=True )
        self.assertEqual( train_set.shape, (750, 600) )
        self.assertEqual( test_set.shape, (250, 600) )

        # Supposed to only return single FeatureSpace instead of 2-tuple of FeatureSpace
        # when setting test_size = 0
        per_class = 50
        retval = fs_discrete.Split( train_size=per_class, test_size=0, random_state=42,
                quiet=True )
        self.assertEqual( type(retval), FeatureSpace )
        self.assertEqual( retval.num_samples, per_class * fs_discrete.num_classes )

        # dummyproofing
        self.assertRaises( ValueError, fs_discrete.Split, train_size='trash' )
        self.assertRaises( ValueError, fs_discrete.Split, train_size=1.1 )
        self.assertRaises( ValueError, fs_discrete.Split, test_size='trash' )
        self.assertRaises( ValueError, fs_discrete.Split, test_size=1.1 )

        # What if the feature set number of groups within a class are less than called for
        # when specifying by integer?
        self.assertRaises( ValueError, test_set.Split, test_size=25 )

        # What happens when input fs has unbalanced classes, some of which have enough
        # to satisfy train_size/test_size params, and some don't
        # list() keeps the concatenation valid where range() is not a list
        # (Python 3); on Python 2 the result is unchanged.
        remove_these = list( range(250,300) ) + list( range(700,750) )
        fs_class_2_and_7_smaller = \
              fs_discrete.SampleReduce( leave_out_sample_group_ids=remove_these )

        self.assertRaises( ValueError, fs_class_2_and_7_smaller.Split, train_size=80,
                           test_size=20 )

        # Test balanced_classes:
        train_fs, test_fs = fs_class_2_and_7_smaller.Split()
        # Training set number rounds down (apparently).
        expected_num_samps_per_train_class = int( floor(50*0.75) )
        expected_num_samps_per_test_class = 50 - expected_num_samps_per_train_class

        err_msg = "Balanced classes {} set split error, class {}, expected {}, got {}"
        class_size_pairs = zip( train_fs.class_sizes, test_fs.class_sizes )
        for class_index, (n_train, n_test) in enumerate( class_size_pairs ):
            self.assertEqual( n_train, expected_num_samps_per_train_class, msg=\
                    err_msg.format( "TRAIN", class_index,
                                    expected_num_samps_per_train_class, n_train  ) )
            self.assertEqual( n_test, expected_num_samps_per_test_class, msg=\
                    err_msg.format( "TEST", class_index,
                                    expected_num_samps_per_test_class, n_test ) )
Exemplo n.º 5
0
    def test_SplitOptions(self):
        """Exercise FeatureSpace.Split with a range of options.

        Covers the default split, ``test_size=0`` (single return value),
        rejection of bad ``train_size``/``test_size`` arguments, and requests
        larger than a class can satisfy.
        """
        from wndcharm.ArtificialFeatureSpace import CreateArtificialFeatureSpace_Discrete

        fs_discrete = CreateArtificialFeatureSpace_Discrete(
            n_samples=1000,
            n_classes=10,
            num_features_per_signal_type=30,
            noise_gradient=5,
            initial_noise_sigma=10,
            n_samples_per_group=1,
            interpolatable=True,
            random_state=42)

        # default
        train_set, test_set = fs_discrete.Split(random_state=42, quiet=True)
        self.assertEqual(train_set.shape, (750, 600))
        self.assertEqual(test_set.shape, (250, 600))

        # Supposed to only return single FeatureSpace instead of 2-tuple of FeatureSpace
        # when setting test_size = 0
        per_class = 50
        retval = fs_discrete.Split(train_size=per_class,
                                   test_size=0,
                                   random_state=42,
                                   quiet=True)
        self.assertEqual(type(retval), FeatureSpace)
        self.assertEqual(retval.num_samples,
                         per_class * fs_discrete.num_classes)

        # dummyproofing
        self.assertRaises(ValueError, fs_discrete.Split, train_size='trash')
        self.assertRaises(ValueError, fs_discrete.Split, train_size=1.1)
        self.assertRaises(ValueError, fs_discrete.Split, test_size='trash')
        self.assertRaises(ValueError, fs_discrete.Split, test_size=1.1)

        # What if the feature set number of groups within a class are less than called for
        # when specifying by integer?
        self.assertRaises(ValueError, test_set.Split, test_size=25)

        # What happens when input fs has unbalanced classes, some of which have enough
        # to satisfy train_size/test_size params, and some don't
        # list() keeps the concatenation valid where range() is not a list
        # (Python 3); on Python 2 the result is unchanged.
        remove_these = list(range(250, 300)) + list(range(700, 750))
        fs_class_2_and_7_smaller = fs_discrete.SampleReduce(
            leave_out_sample_group_ids=remove_these)

        self.assertRaises(ValueError,
                          fs_class_2_and_7_smaller.Split,
                          train_size=80,
                          test_size=20)
Exemplo n.º 6
0
    def test_AccuracyVersusNumFeaturesGraph( self ):
        """Accuracy vs. # features with and without LDA feature space transform

        Builds a small artificial feature space and constructs the graph from
        it. Nothing is asserted; the test passes when construction succeeds.
        """

        # Make a smaller featureset to do multiple splits
        small_fs = CreateArtificialFeatureSpace_Discrete(
                name="DiscreteArtificialFS RANK ORDERED SHUFFLE SPLIT",
                n_samples=100,                    # smaller
                n_classes=5,                      # smaller, 20 samples per class
                num_features_per_signal_type=10,  # smaller
                initial_noise_sigma=50,
                noise_gradient=20,
                n_samples_per_group=1,
                interpolatable=True,
                random_state=42,
                singularity=False,
                clip=False )

        AccuracyVersusNumFeaturesGraph(
                quiet=False,
                feature_space=small_fs,
                n_iter=10,
                train_size=18,  # per-class
                test_size=2,    # per-class
                random_state=42 )
Exemplo n.º 7
0
    def test_FromDiscreteClassificationExperimentResults(self):
        """Rank Ordered Predicted values graph from an experiment result (multiple splits)

        Runs a shuffle-split experiment on a small artificial feature space
        and hands the resulting graph to CompareGraphs together with the
        reference file name.
        """

        testfilename = 'test_graph_rank_ordered_experiment.npy'

        # Make a smaller featureset to do multiple splits
        small_fs = CreateArtificialFeatureSpace_Discrete(
            name="DiscreteArtificialFS RANK ORDERED SHUFFLE SPLIT",
            n_samples=100,  # smaller
            n_classes=5,  # smaller, 20 samples per class
            num_features_per_signal_type=10,  # smaller
            initial_noise_sigma=50,
            noise_gradient=20,
            n_samples_per_group=1,
            interpolatable=True,
            random_state=42,
            singularity=False,
            clip=False)

        exp = FeatureSpaceClassificationExperiment.NewShuffleSplit(
            small_fs,
            quiet=True,
            n_iter=10,
            train_size=18,  # per-class
            test_size=2,  # per-class
            random_state=42)

        # Graph every individual prediction rather than the averaged results.
        graph = PredictedValuesGraph(exp, use_averaged_results=False)
        graph.RankOrderedPredictedValuesGraph()
        self.CompareGraphs(graph, testfilename)
Exemplo n.º 8
0
    def test_PerSampleStatisticsWITHPredictedValue(self):
        """DISCRETE PerSampleStatistics with numeric predicted value

        Smoke test: runs a shuffle-split experiment on an interpolatable
        two-class artificial feature space and prints its reports. It passes
        when nothing raises.
        """

        fs = CreateArtificialFeatureSpace_Discrete(
            name="DISCRETE PerSampleStatistics WITH Pred Values",
            n_samples=40,
            n_classes=2,
            num_features_per_signal_type=10,  # small on purpose, to make test fast
            noise_gradient=50,
            initial_noise_sigma=75,
            n_samples_per_group=1,
            random_state=42,
            interpolatable=True,
            singularity=False,
            clip=False)

        exp = FeatureSpaceClassificationExperiment.NewShuffleSplit(
            fs,
            name="Discrete PerSampleStatistics ShuffleSplit WITH Pred Values",
            quiet=True,
            n_iter=10,
            train_size=8,  # per-class
            test_size=2,  # per-class
            random_state=42)

        # The original author notes that Print() also generates the statistics.
        exp.Print()
        exp.PerSampleStatistics()
        self.assertTrue(True)
Exemplo n.º 9
0
    def test_NumFeaturesGridSearch(self):
        """Run NumFeaturesGridSearch twice on one feature space: once with the
        default pipeline, once with LDA plus the pre-LDA feature filter.

        Nothing is asserted; the test passes when neither search raises.
        """

        fs = CreateArtificialFeatureSpace_Discrete(
            name="DISCRETE PerSampleStatistics WITH Pred Values",
            n_samples=250,
            n_classes=10,
            num_features_per_signal_type=10,  # small on purpose, to make test fast
            noise_gradient=5,
            initial_noise_sigma=75,
            n_samples_per_group=1,
            random_state=42,
            interpolatable=True,
            singularity=False,
            clip=False)

        common_args = {
            'feature_space': fs,
            'quiet': False,
            'n_iter': 10,
            'random_state': 42,
            'conserve_mem': False,  # otherwise the input fs will be modified
        }

        grid_search = FeatureSpaceClassificationExperiment.NumFeaturesGridSearch
        grid_search(**common_args)
        grid_search(lda=True, pre_lda_feature_filter=True, **common_args)
Exemplo n.º 10
0
    def test_IfNotInterpolatable( self ):
        """You can't graph predicted values if the classes aren't interpolatable.

        Classifies a non-interpolatable two-class feature space and expects
        PredictedValuesGraph to reject the result with ValueError.
        """

        small_fs = CreateArtificialFeatureSpace_Discrete(
                        n_samples=20, n_classes=2, random_state=42, interpolatable=False )
        training, held_out = small_fs.Split( random_state=False, quiet=True )
        training.Normalize()

        weights = FisherFeatureWeights.NewFromFeatureSpace( training ).Threshold()
        reduced_training = training.FeatureReduce( weights )
        reduced_held_out = held_out.FeatureReduce( weights )
        # NOTE(review): this normalizes the un-reduced test set after the
        # reduced copy was already taken, so the reduced test set passed to
        # NewWND5 below may never be normalized. Confirm whether that is
        # intended.
        held_out.Normalize( training, quiet=True )

        batch_result = FeatureSpaceClassification.NewWND5(
                                    reduced_training, reduced_held_out, weights, quiet=True )
        self.assertRaises( ValueError, PredictedValuesGraph, batch_result )
    def test_TiledTrainTestSplit(self):
        """Uses a fake FeatureSpace

        Builds a 10-class artificial feature space with 4 samples per group,
        runs a train/test WND5 classification, and checks that every class
        has a similarity of exactly 1.0 to itself in the similarity matrix.
        """

        from wndcharm.ArtificialFeatureSpace import CreateArtificialFeatureSpace_Discrete
        fs_kwargs = {}
        fs_kwargs['name'] = "DiscreteArtificialFS 10-class"
        fs_kwargs['n_samples'] = 1000
        fs_kwargs['n_classes'] = 10  # 100 samples per class
        fs_kwargs['num_features_per_signal_type'] = 25
        fs_kwargs['initial_noise_sigma'] = 40
        fs_kwargs['noise_gradient'] = 20
        fs_kwargs['n_samples_per_group'] = 4  # 25 images, 2x2 tiling scheme
        fs_kwargs['interpolatable'] = True
        fs_kwargs['random_state'] = 43
        fs_kwargs['singularity'] = False
        fs_kwargs['clip'] = False

        fs = CreateArtificialFeatureSpace_Discrete(**fs_kwargs)

        train, test = fs.Split(random_state=False, quiet=True)
        train.Normalize(inplace=True, quiet=True)
        fw = FisherFeatureWeights.NewFromFeatureSpace(train).Threshold()

        train.FeatureReduce(fw, inplace=True)
        test.FeatureReduce(fw, inplace=True,
                           quiet=True).Normalize(train,
                                                 inplace=True,
                                                 quiet=True)

        result = FeatureSpaceClassification.NewWND5(train, test, fw)
        result.Print()

        for class_name in result.test_set.class_names:
            # Look the value up once, outside the assertion, so that a failed
            # lookup surfaces directly. The offending class and value go into
            # the assertion message instead of a print inside a bare except.
            self_similarity = result.similarity_matrix[class_name][class_name]
            self.assertEqual(
                self_similarity, float(1),
                msg="offending class: {0}, val: {1}".format(
                    class_name, self_similarity))
Exemplo n.º 12
0
    def test_PerSampleStatisticsWITHPredictedValue(self):
        """DISCRETE PerSampleStatistics with numeric predicted value

        Smoke test: runs the shuffle-split experiment without and then with
        LDA, and prints the reports of the LDA run. It passes when nothing
        raises.
        """

        fs = CreateArtificialFeatureSpace_Discrete(
            name="DISCRETE PerSampleStatistics WITH Pred Values",
            n_samples=40,
            n_classes=2,
            num_features_per_signal_type=10,  # small on purpose, to make test fast
            noise_gradient=50,
            initial_noise_sigma=75,
            n_samples_per_group=1,
            random_state=42,
            interpolatable=True,
            singularity=False,
            clip=False)

        split_args = {
            'name': "Discrete PerSampleStatistics ShuffleSplit WITH Pred Values",
            'quiet': True,
            'n_iter': 10,
            'train_size': 8,  # per-class
            'test_size': 2,  # per-class
            'random_state': 42,
        }
        new_shuffle_split = FeatureSpaceClassificationExperiment.NewShuffleSplit

        # Use case 1: Straight, classic WND-CHARM train/test splits
        exp = new_shuffle_split(fs, **split_args)

        # Use case 2: Put LDA in pipeline (no fisher feature space prefilter, by default)
        split_args['lda'] = True
        exp = new_shuffle_split(fs, **split_args)

        # Use cases 3 and 4 are disabled: adding pre_lda_feature_filter=True
        # (Fisher prefilter before LDA) and then lda_features_size=0.5
        # (post-LDA dimension reduction).

        # Only the LDA experiment (the last one assigned) is reported.
        exp.Print()
        exp.PerSampleStatistics()
        self.assertTrue(True)
    def test_PerSampleStatisticsWITHOUTPredictedValue(self):
        """DISCRETE ShuffleSplit/PerSampleStatistics w/ no predicted value

        Smoke test on a non-interpolatable two-class artificial feature
        space: runs the experiment without and then with LDA, and prints the
        reports of the LDA run. It passes when nothing raises.
        """

        # The test-l.fit reference file is not used here because, per the
        # original author, its class names are interpolatable.
        fs = CreateArtificialFeatureSpace_Discrete(
            name="DISCRETE PerSampleStatistics No Pred Values",
            n_samples=20,
            n_classes=2,
            num_features_per_signal_type=10,  # small on purpose, to make test fast
            noise_gradient=50,
            initial_noise_sigma=75,
            n_samples_per_group=1,
            random_state=42,
            interpolatable=False,
            singularity=False,
            clip=False)

        split_args = {
            'name': "Discrete PerSampleStatistics ShuffleSplit No Pred Values",
            'quiet': True,
            'n_iter': 1,
            'train_size': 8,  # per-class
            'test_size': 2,  # per-class
            'random_state': 42,
        }
        new_shuffle_split = FeatureSpaceClassificationExperiment.NewShuffleSplit

        exp = new_shuffle_split(fs, **split_args)

        split_args['lda'] = True
        exp = new_shuffle_split(fs, **split_args)

        # Only the LDA experiment (the last one assigned) is reported.
        exp.Print()
        exp.PerSampleStatistics()
        self.assertTrue(True)
Exemplo n.º 14
0
    def test_HyperparameterOptimizationGraph(self):
        """Accuracy vs. # features or samples with and without LDA feature space transform

        Runs GridSearch on a fresh graph twice, first with param='features'
        and then with param='samples'. Nothing is asserted; the test passes
        when neither search raises.
        """

        # Make a smaller featureset to do multiple splits
        small_fs = CreateArtificialFeatureSpace_Discrete(
            name="DiscreteArtificialFS RANK ORDERED SHUFFLE SPLIT",
            n_samples=100,  # smaller
            n_classes=5,  # smaller, 20 samples per class
            num_features_per_signal_type=10,  # smaller
            initial_noise_sigma=50,
            noise_gradient=20,
            n_samples_per_group=1,
            interpolatable=True,
            random_state=42,
            singularity=False,
            clip=False)

        search_args = {
            'quiet': False,
            'n_iter': 10,
            'train_size': 18,  # per-class
            'test_size': 2,  # per-class
            'random_state': 42,
            'show_raw': True,
            'show_lda': True,
            'param': 'features',
            'text_angle': -30,
        }

        graph = HyperparameterOptimizationGraph(small_fs)
        graph.GridSearch(**search_args)

        # Same settings, searching over the other hyperparameter.
        search_args['param'] = 'samples'
        graph = HyperparameterOptimizationGraph(small_fs)
        graph.GridSearch(**search_args)
Exemplo n.º 15
0
    def test_SampleReduce( self ):
        """Exercise SampleReduce in leave-in and leave-out mode.

        Eight sections cover discrete and continuous feature spaces, each
        untiled (1 sample per group) and tiled (4 samples per group), and
        check the resulting sample counts, names and error handling.
        """
        from wndcharm.ArtificialFeatureSpace import CreateArtificialFeatureSpace_Discrete

        n_classes = 10
        num_tiles = 4
        # Settings shared by every artificial feature space built below.
        shared_args = dict( n_samples=1000, num_features_per_signal_type=30,
                noise_gradient=5, initial_noise_sigma=10 )

        #========================================================
        # Section 1: LEAVE IN, Untiled Discrete (w/ classes) FeatureSpace instances
        fs_discrete = CreateArtificialFeatureSpace_Discrete( n_classes=n_classes,
                n_samples_per_group=1, interpolatable=True, **shared_args )

        # Reduce to 9 classes from 10, one sample per class
        # Drop the last class:
        keep_ids = range(50, 950, 100)
        one_per_class = fs_discrete.SampleReduce( keep_ids )
        # Further reduce to 8 classes
        one_per_class.RemoveClass( "FakeClass-055.6", inplace=True )

        # Expected order follows the class values, which differs from the
        # alphanumeric sort order of the names.
        correct_samplenames = [
            'FakeClass-100.0_050', 'FakeClass-077.8_050', 'FakeClass-033.3_050',
            'FakeClass-011.1_050', 'FakeClass+011.1_050', 'FakeClass+033.3_050',
            'FakeClass+055.6_050', 'FakeClass+077.8_050' ]
        self.assertEqual( correct_samplenames, one_per_class._contiguous_sample_names )

        correct_classnames = [
            'FakeClass-100.0', 'FakeClass-077.8', 'FakeClass-033.3', 'FakeClass-011.1',
            'FakeClass+011.1', 'FakeClass+033.3', 'FakeClass+055.6', 'FakeClass+077.8' ]
        self.assertEqual( correct_classnames, one_per_class.class_names )
        del one_per_class

        #========================================================
        # Section 2: LEAVE OUT, UNTiled Feature sets, Discrete FeatureSpace instances
        drop_ids = range(50, 950, 100)
        remainder = fs_discrete.SampleReduce( leave_out_sample_group_ids=drop_ids )
        self.assertEqual( remainder.num_samples, fs_discrete.num_samples - len( drop_ids ) )

        # Single integers for leave_out_list is ok
        drop_ids = 50
        remainder = fs_discrete.SampleReduce( leave_out_sample_group_ids=drop_ids )
        self.assertEqual( remainder.num_samples, fs_discrete.num_samples - 1 )
        del remainder

        #========================================================
        # Section 3: LEAVE IN, Tiled Feature sets, Discrete FeatureSpace instances
        fs_discrete = CreateArtificialFeatureSpace_Discrete( n_classes=n_classes,
                n_samples_per_group=num_tiles, interpolatable=True, **shared_args )

        keep_ids = range(5, 95, 10) # Rearrange into 9 classes
        tiled_subset = fs_discrete.SampleReduce( keep_ids )
        # Total num samples should be 9 classes, 1 sample group per class, 4 tiles per SG = 36
        self.assertEqual( num_tiles * len( keep_ids ), tiled_subset.num_samples )
        del tiled_subset

        #========================================================
        # Section 4: LEAVE OUT, WITH Tiled Feature sets, Discrete FeatureSpace instances

        # You can't leave out a sample group that doesn't exist
        with self.assertRaises( ValueError ):
            fs_discrete.SampleReduce( leave_out_sample_group_ids=range(50000, 50010) )

        # Can't leave out trash
        with self.assertRaises( TypeError ):
            fs_discrete.SampleReduce( leave_out_sample_group_ids=['foo', 'bar'] )

        # This input is ok:
        drop_ids = range(5, 95, 10)
        tiled_remainder = fs_discrete.SampleReduce( leave_out_sample_group_ids=drop_ids )
        self.assertEqual( tiled_remainder.num_samples,
                fs_discrete.num_samples - len( drop_ids ) * num_tiles )
        del tiled_remainder

        #========================================================
        # Section 5: LEAVE IN, Untiled Continuous FeatureSpace instances
        from wndcharm.ArtificialFeatureSpace import CreateArtificialFeatureSpace_Continuous

        fs_cont = CreateArtificialFeatureSpace_Continuous( n_samples_per_group=1,
                **shared_args )

        # dummyproof
        with self.assertRaises( TypeError ):
            fs_cont.SampleReduce( ['foo', 'bar'] )

        keep_ids = range(50, 950)
        cont_subset = fs_cont.SampleReduce( keep_ids )
        self.assertEqual( cont_subset.num_samples, len(keep_ids) )
        del cont_subset

        #========================================================
        # Section 6: LEAVE OUT, Untiled Continuous FeatureSpace instances
        drop_ids = range(50, 950)
        cont_remainder = fs_cont.SampleReduce( leave_out_sample_group_ids=drop_ids )
        self.assertEqual( cont_remainder.num_samples, fs_cont.num_samples - len(drop_ids) )
        del cont_remainder

        # single int is ok
        cont_minus_one = fs_cont.SampleReduce( leave_out_sample_group_ids=998 )
        self.assertEqual( cont_minus_one.num_samples, fs_cont.num_samples - 1 )
        del cont_minus_one

        #========================================================
        # Section 7: LEAVE IN, TILED Continuous FeatureSpace instances
        fs_cont = CreateArtificialFeatureSpace_Continuous( n_samples_per_group=num_tiles,
                **shared_args )

        keep_ids = range(50, 95)
        tiled_cont_subset = fs_cont.SampleReduce( keep_ids )
        self.assertEqual( tiled_cont_subset.num_samples, len(keep_ids) * num_tiles )
        del tiled_cont_subset

        # single int is ok, ALTHOUGH NOT SURE WHY YOU'D EVER WANT A FS WITH A SINGLE SAMPLE
        single_group = fs_cont.SampleReduce( 98 )
        self.assertEqual( single_group.num_samples, num_tiles )
        del single_group

        #========================================================
        # Section 8: LEAVE OUT, TILED Continuous FeatureSpace instances
        drop_ids = range(50, 95)
        tiled_cont_remainder = fs_cont.SampleReduce( leave_out_sample_group_ids=drop_ids )
        self.assertEqual( tiled_cont_remainder.num_samples,
                fs_cont.num_samples - len(drop_ids) * num_tiles )
        del tiled_cont_remainder

        # single int is ok
        tiled_cont_minus_one = fs_cont.SampleReduce( leave_out_sample_group_ids=98 )
        self.assertEqual( tiled_cont_minus_one.num_samples, fs_cont.num_samples - num_tiles  )
        del tiled_cont_minus_one
Exemplo n.º 16
0
class TestGraphs(unittest.TestCase):
    """Test WND-CHARM's graph-making functionality."""

    # Everything below runs once, when the class body is executed, and is
    # shared (as class attributes) by the single-split graph tests.

    # Parameters of the artificial 10-class discrete feature space.
    fs_kwargs = {
        'name': "DiscreteArtificialFS 10-class",
        'n_samples': 1000,
        'n_classes': 10,
        'num_features_per_signal_type': 25,
        'initial_noise_sigma': 40,
        'noise_gradient': 20,
        'n_samples_per_group': 1,
        'interpolatable': True,
        'random_state': 43,
        'singularity': False,
        'clip': False,
    }

    fs = CreateArtificialFeatureSpace_Discrete(**fs_kwargs)

    # Split, normalize the training half, and derive thresholded Fisher
    # weights from it.
    train_set, test_set = fs.Split(random_state=False, quiet=True)
    train_set.Normalize(quiet=True)
    fw = FisherFeatureWeights.NewFromFeatureSpace(train_set).Threshold()

    # Reduce both halves to the weighted features, then normalize the
    # reduced test set against the reduced training set.
    reduced_train_set = train_set.FeatureReduce(fw)
    reduced_test_set = test_set.FeatureReduce(fw)
    reduced_test_set.Normalize(reduced_train_set, quiet=True)

    # Classification result that the single-split tests graph from.
    batch_result = FeatureSpaceClassification.NewWND5(
        reduced_train_set, reduced_test_set, fw, quiet=True)

    def setUp(self):
        """Create a scratch directory via mkdtemp() and keep its path on self.tempdir."""
        self.tempdir = mkdtemp()

    def tearDown(self):
        """Remove the scratch directory stored on self.tempdir by setUp."""
        rmtree(self.tempdir)

    def CompareGraphs(self, graph, testfilename):
        """Check a graph's plotted coordinates against a saved reference array.

        We used to output the graphs to a png file and do a binary diff on a
        reference png, but there are superficial differences between
        matplotlib versions that result in the points still being in the
        right place while the font is slightly larger or the text is subtly
        offset. So now we interrogate the matplotlib figure object, retrieve
        its coordinates and check them against blessed numpy arrays saved to
        a npy file.

        Args:
            graph: object exposing its matplotlib figure as ``graph.figure``.
            testfilename: path of the reference ``.npy`` file; handed to
                ``np.load`` unchanged.

        The test is failed (via ``self.fail``) when the figure holds neither
        lines nor collections, when the generated and reference arrays have
        different shapes, or when their values are not close.
        """

        # Uncomment to see what graph looks like!
        #graph.SaveToFile( testfilename + 'GRAPH.png' )

        axes = graph.figure.gca()

        if len(axes.lines) > 0:
            # line plot
            try:
                per_group = [line._path._vertices for line in axes.lines]
            except AttributeError:
                # older version of matplotlib didn't include leading
                # underscore in attribute "_vertices"
                per_group = [line._path.vertices for line in axes.lines]
        elif len(axes.collections) > 0:
            # scatter plot
            per_group = [coll._offsets for coll in axes.collections]
        else:
            self.fail("Graph doesn't have any lines nor points")

        # One coordinate array per plotted group, stacked along a third axis.
        all_coords = np.dstack(tuple(per_group))

        # uncomment to replace old coords
        #np.save( testfilename, all_coords )
        #from os.path import splitext
        #testfilename_base, ext = splitext( testfilename )
        #np.save( testfilename_base + 'NEW.npy', all_coords )
        reference_array = np.load(testfilename)

        # np.allclose broadcasts its arguments, so a reference of a different
        # shape could either compare "close" by accident or blow up with a
        # broadcasting error. Reject mismatched shapes explicitly instead.
        if all_coords.shape != reference_array.shape:
            self.fail(
                'Reference graph "{0}" coordinates have shape {1}, but this '
                'test generated coordinates of shape {2}.'.format(
                    testfilename, reference_array.shape, all_coords.shape))

        # Exact equality is tried first; closeness is the fallback.
        if not (np.array_equal(all_coords, reference_array)
                or np.allclose(all_coords, reference_array)):
            errmsg = 'Reference graph "{0}" coordinates '.format(testfilename) + \
                'do not concur with coordinates generated by this test.'
            self.fail(errmsg)

    @unittest.skipIf(HasMatplotlib, "Skipped if matplotlib IS installed")
    def test_ErrMsgIfMatplotibNotInstalled(self):
        """Graphing methods raise ImportError if matplotlib is not installed."""

        graph = PredictedValuesGraph(self.batch_result)

        # Constructing the graph object must work; only drawing should raise.
        self.assertRaises(ImportError, graph.RankOrderedPredictedValuesGraph)
        self.assertRaises(ImportError, graph.KernelSmoothedDensityGraph)

    @unittest.skipUnless(HasMatplotlib,
                         "Skipped if matplotlib IS NOT installed")
    @unittest.expectedFailure
    def test_RankOrderedFromBatchClassificationResult(self):
        """Rank-ordered predicted values graph from a single split."""

        # Graph the class-level single-split classification result.
        rank_ordered = PredictedValuesGraph(self.batch_result)
        rank_ordered.RankOrderedPredictedValuesGraph()
        self.CompareGraphs(
            rank_ordered, 'test_graph_rank_ordered_interpolated_discrete.npy')

    @unittest.skipUnless(HasMatplotlib,
                         "Skipped if matplotlib IS NOT installed")
    @unittest.expectedFailure
    def test_KernelSmoothedFromBatchClassificationResult(self):
        """Kernel-smoothed probability density graph from a single split."""

        # Graph the class-level single-split classification result.
        smoothed = PredictedValuesGraph(self.batch_result)
        smoothed.KernelSmoothedDensityGraph()
        self.CompareGraphs(smoothed, 'test_graph_kernel_smoothed.npy')

    @unittest.skipUnless(HasMatplotlib,
                         "Skipped if matplotlib IS NOT installed")
    def test_FromDiscreteClassificationExperimentResults(self):
        """Rank-ordered predicted values graph from an experiment (multiple splits)."""

        # A smaller feature space than the class-level one, since it gets
        # split repeatedly: 100 samples over 5 classes is 20 per class.
        small_fs = CreateArtificialFeatureSpace_Discrete(
            name="DiscreteArtificialFS RANK ORDERED SHUFFLE SPLIT",
            n_samples=100,
            n_classes=5,
            num_features_per_signal_type=10,
            initial_noise_sigma=50,
            noise_gradient=20,
            n_samples_per_group=1,
            interpolatable=True,
            random_state=42,
            singularity=False,
            clip=False)

        # train_size and test_size are per-class counts.
        exp = FeatureSpaceClassificationExperiment.NewShuffleSplit(
            small_fs,
            quiet=True,
            n_iter=10,
            train_size=18,
            test_size=2,
            random_state=42)

        graph = PredictedValuesGraph(exp, use_averaged_results=False)
        graph.RankOrderedPredictedValuesGraph()
        self.CompareGraphs(graph, 'test_graph_rank_ordered_experiment.npy')

    @unittest.skipUnless(HasMatplotlib,
                         "Skipped if matplotlib IS NOT installed")
    def test_HyperparameterOptimizationGraph(self):
        """Accuracy vs. # features or samples with and without LDA feature space transform.

        Runs one grid search per parameter; the resulting graphs are not
        compared against any reference array.
        """

        # A smaller feature space to do multiple splits on:
        # 100 samples over 5 classes is 20 per class.
        small_fs = CreateArtificialFeatureSpace_Discrete(
            name="DiscreteArtificialFS RANK ORDERED SHUFFLE SPLIT",
            n_samples=100,
            n_classes=5,
            num_features_per_signal_type=10,
            initial_noise_sigma=50,
            noise_gradient=20,
            n_samples_per_group=1,
            interpolatable=True,
            random_state=42,
            singularity=False,
            clip=False)

        # Settings common to both searches; train_size and test_size are
        # per-class counts.
        grid_kwargs = {
            'quiet': False,
            'n_iter': 10,
            'train_size': 18,
            'test_size': 2,
            'random_state': 42,
            'show_raw': True,
            'show_lda': True,
            'text_angle': -30,
        }

        # A fresh graph object is built for each searched parameter.
        for param in ('features', 'samples'):
            graph = HyperparameterOptimizationGraph(small_fs)
            graph.GridSearch(param=param, **grid_kwargs)
            # To look at the result, save it here with graph.savefig(...)

    @unittest.skipUnless(HasMatplotlib,
                         "Skipped if matplotlib IS NOT installed")
    def test_FromHTML(self):
        """Rank Ordered Predicted values graph from a saved HTML report."""

        testfilename = 'test_graph_fromHTML.npy'
        import zipfile

        # The zipped report was originally created like this:
        #zipped_file_path = pychrm_test_dir + sep + 'c_elegans_terminal_bulb.html'
        #import zlib
        #zf = zipfile.ZipFile( zipped_file_path + '.zip', mode='w' )
        #zf.write( zipped_file_path, compress_type=zipfile.ZIP_DEFLATED )
        #zf.close()

        # Inflate the zipped html file into this test's temp directory.
        # The with-block closes the archive even if extraction raises.
        zipped_file_path = pychrm_test_dir + sep + 'c_elegans_terminal_bulb.html.zip'
        with zipfile.ZipFile(zipped_file_path, mode='r') as zf:
            zf.extractall(self.tempdir)
            # The first archive member is taken to be the HTML report.
            htmlfilepath = self.tempdir + sep + zf.namelist()[0]

        graph = PredictedValuesGraph.NewFromHTMLReport(
            htmlfilepath, use_averaged_results=False)
        graph.RankOrderedPredictedValuesGraph()

        self.CompareGraphs(graph, testfilename)

    @unittest.skipUnless(HasMatplotlib,
                         "Skipped if matplotlib IS NOT installed")
    def test_IfNotInterpolatable(self):
        """You can't graph predicted values if the classes aren't interpolatable."""

        small_fs = CreateArtificialFeatureSpace_Discrete(n_samples=20,
                                                         n_classes=2,
                                                         random_state=42,
                                                         interpolatable=False)
        train_set, test_set = small_fs.Split(random_state=False, quiet=True)
        train_set.Normalize(quiet=True)

        fw = FisherFeatureWeights.NewFromFeatureSpace(train_set).Threshold()
        reduced_train_set = train_set.FeatureReduce(fw)
        reduced_test_set = test_set.FeatureReduce(fw)
        # Normalize the set that is actually classified, against the reduced
        # training set, the same way the class-level fixture does.
        reduced_test_set.Normalize(reduced_train_set, quiet=True)

        batch_result = FeatureSpaceClassification.NewWND5(reduced_train_set,
                                                          reduced_test_set,
                                                          fw,
                                                          quiet=True)

        # Only constructing the graph is expected to raise; everything above
        # must succeed.
        with self.assertRaises(ValueError):
            PredictedValuesGraph(batch_result)