예제 #1
0
    def test_retrieve_nonexistent_file(self):
        """Loading from a path that does not exist raises ``RetrieverError``."""
        retriever = Retriever('nonexistentfile.data')

        # The retriever is built outside the assertion, so only the call to
        # load() is expected to raise.
        self.assertRaises(errors.RetrieverError, retriever.load)
예제 #2
0
    def test_split_target(self, data, target_column, expected_data,
                          expected_target):
        """Check that ``split_target`` separates the target column correctly.

        The arguments are presumably supplied by a parameterizing decorator
        that is not visible here -- confirm against the enclosing class.

        :param data: raw samples, convertible with ``np.array``.
        :param target_column: column passed to ``split_target``.
        :param expected_data: samples expected once the target is removed.
        :param expected_target: target values expected to be extracted.
        """
        r = Retriever('nonexistentfile.data')
        # Inject the samples directly; load() is never called, so the file
        # name given above is not read by this test.
        r._data = np.array(data)
        actual_data, actual_target = r.split_target(target_column).retrieve()

        testing.assert_array_equal(actual_data, np.array(expected_data))
        # Compare the *actual* target. The previous assertion compared
        # expected_target with itself and therefore could never fail.
        testing.assert_array_equal(actual_target, np.array(expected_target))
예제 #3
0
    def _load_data(self):
        """Read the WDBC breast-cancer file into ``data`` and ``target``."""
        retriever = Retriever('../../datasets/breast-cancer/wdbc.data',
                              delimiter=',')
        # The first column holds the sample ids; drop it.
        retriever.split_column(0)

        # The target feature is the 2nd column of the file, but with the ids
        # already removed it now sits at index 0.
        without_target = retriever.split_target(0)
        self.data, self.target = without_target.retrieve()
        self.data = self.data.astype(float)
        self.feature_names = ['radius', 'texture', 'perimeter']
예제 #4
0
    def _load_data(self):
        """Read the Pima Indians diabetes file and configure the displayer."""
        retriever = Retriever(
            '../../datasets/diabetes/pima-indians-diabetes.data',
            delimiter=',')
        # split_target() is called with its default target column.
        without_target = retriever.split_target()
        self.data, self.target = without_target.retrieve()
        self.feature_names = [
            'Number of times pregnant',
            'Plasma glucose concentration',
            'Diastolic blood pressure (mm Hg)',
        ]

        self.displayer.colors = [plt.cm.viridis]
예제 #5
0
    def test_retrieve_brainwave(self):
        """Load the brainwave file and check sample, feature and target counts."""
        expected_samples = 182
        expected_features = 12

        source = Retriever('datasets/brainwave/plrx.txt', target_column=-1)
        loaded = source.load()
        data, target = loaded.split_target().retrieve()

        # One row per sample.
        self.assertEqual(expected_samples, len(data))

        # Features are counted on the second row.
        self.assertEqual(expected_features, len(data[1]))

        # One target entry per sample.
        self.assertEqual(expected_samples, len(target))
예제 #6
0
    def test_display_dimensions(self):
        """Scatter the glass data set three columns at a time, then show it
        together with its Isomap reduction."""
        data_set = 'glass/glass.data'
        path = os.path.join('datasets/', data_set)

        print('Displaying data set {%s} in the Rn' % path)

        glass = Retriever(path, delimiter=',')

        # The first column of the glass file stores the samples' ids.
        glass.split_column(0)
        # Its last column is the target feature.
        glass.split_target()

        samples, target = glass.retrieve()
        isomap = algorithms.Isomap(samples, e=20)
        reduced = isomap.run()

        displayer = Displayer(title=data_set)

        # One graph per group of (up to) three consecutive dimensions.
        group = 3
        for start in range(0, glass.features_count, group):
            stop = min(glass.features_count, start + group)
            displayer.load(samples[:, start:stop],
                           color=target,
                           title='Dimensions: d e [%i, %i]' % (start + 1, stop))

        # NOTE(review): this call passes the title first and positionally,
        # unlike the keyword-style calls in the loop above -- confirm against
        # Displayer.load's signature.
        loaded = displayer.load('Reduced glass data-set', reduced, target)
        loaded.show()
예제 #7
0
    def load_data(self):
        """Read ``self.file``, save a plot of three columns and print its size."""
        retriever = Retriever(self.file, delimiter=',')
        self.data, self.target = retriever.split_target().retrieve()
        # Keep a second reference to the data as it was loaded.
        self.original_data = self.data

        # Plot columns 1..3 against the target, save the figure and release it.
        plot = self.displayer.load(self.data[:, 1:4], self.target)
        saved = plot.save('datasets/spam')
        saved.dispose()

        size_kb = self.data.nbytes / 1024
        print('Data set size: %.2fKB' % size_kb)
        print('shape: %s' % str(self.data.shape))
예제 #8
0
    def _run(self):
        """Plot the data set three features at a time, print the correlation
        matrix of its features and save the resulting figure."""
        # The data-set file is comma-delimited.
        retriever = Retriever(self.data_set_file, delimiter=',')
        samples, target = retriever.split_target().retrieve()

        axis_names = [
            '\nAge',
            '\nGender',
            '\nTotal Bilirubin (TB)',
            '\nDirect Bilirubin (DB)',
            '\nAlkaline Phosphotase (Alk.)',
            '\nAlamine Aminotransferase (Sgpt)',
            '\nAspartate Aminotransferase (Sgot)',
            '\nTotal Protiens (TP)',
            '\nAlbumin (ALB)',
            '\nRatio Albumin and Globulin Ratio (A/G)',
        ]

        # Load the columns in consecutive groups of three, each group with
        # its matching axis labels.
        group = 3
        for start in range(0, samples.shape[1], group):
            stop = start + group
            self.displayer.load(samples[:, start:stop],
                                target,
                                axis_labels=axis_names[start:stop])

        print('Correlation matrix:')
        # rowvar=0: each column is a variable, each row an observation.
        print(np.corrcoef(samples, rowvar=0))

        self.displayer.save('displaying_ilpd')
예제 #9
0
 def _load_data(self):
     """Read ``self.file``, drop the id column and split off the target."""
     retriever = Retriever(self.file, delimiter=',')
     # The first column holds the sample ids; drop it.
     retriever.split_column(0)

     without_target = retriever.split_target()
     self.data, self.target = without_target.retrieve()
     # Keep a second reference to the data as it was loaded.
     self.original_data = self.data