def test_retrieve_nonexistent_file(self):
    """Loading from a path that does not exist must raise ``RetrieverError``."""
    retriever = Retriever('nonexistentfile.data')

    # Only the ``load`` call is expected to fail, so the retriever is
    # constructed outside of the ``assertRaises`` context.
    with self.assertRaises(errors.RetrieverError):
        retriever.load()
def test_split_target(self, data, target_column, expected_data,
                      expected_target):
    """Check that ``split_target`` separates the target column from the data.

    :param data: raw samples (array-like) injected into the retriever.
    :param target_column: index of the column to be split off as target.
    :param expected_data: samples expected once the target was removed.
    :param expected_target: target values expected to be returned.
    """
    # The path is only a placeholder: the samples are injected straight
    # into ``_data`` instead of being read through ``load``.
    r = Retriever('nonexistentfile.data')
    r._data = np.array(data)

    actual_data, actual_target = r.split_target(target_column).retrieve()

    testing.assert_array_equal(actual_data, np.array(expected_data))
    # Compare the *actual* target. Comparing ``expected_target`` against
    # itself would make this assertion pass unconditionally.
    testing.assert_array_equal(actual_target, np.array(expected_target))
def _load_data(self):
    """Load the breast-cancer (wdbc) data set into this instance.

    Sets ``self.data`` (cast to ``float``), ``self.target`` and
    ``self.feature_names``.
    """
    retriever = Retriever('../../datasets/breast-cancer/wdbc.data',
                          delimiter=',')

    # The first column holds the samples' ids; drop it.
    retriever.split_column(0)

    # In the raw file the target feature sits in the 2nd column. With the
    # ids already removed, it has moved to the 1st one (index 0).
    split = retriever.split_target(0)
    self.data, self.target = split.retrieve()
    self.data = self.data.astype(float)

    self.feature_names = ['radius', 'texture', 'perimeter']
def _load_data(self):
    """Load the Pima Indians diabetes data set into this instance.

    Sets ``self.data``, ``self.target`` and ``self.feature_names``, and
    assigns the viridis color map to the displayer.
    """
    source = '../../datasets/diabetes/pima-indians-diabetes.data'
    retriever = Retriever(source, delimiter=',')

    self.data, self.target = retriever.split_target().retrieve()

    self.feature_names = [
        'Number of times pregnant',
        'Plasma glucose concentration',
        'Diastolic blood pressure (mm Hg)',
    ]
    self.displayer.colors = [plt.cm.viridis]
def test_retrieve_brainwave(self):
    """Retrieve the brainwave data set and check its dimensions."""
    expected_samples = 182
    expected_features = 12

    source = Retriever('datasets/brainwave/plrx.txt', target_column=-1)
    samples, labels = source.load().split_target().retrieve()

    # Number of samples.
    self.assertEqual(expected_samples, len(samples))
    # Number of features, measured on the second sample.
    self.assertEqual(expected_features, len(samples[1]))
    # Number of entries in the target array.
    self.assertEqual(expected_samples, len(labels))
def test_display_dimensions(self):
    """Display the glass data set three dimensions at a time.

    Scatters every group of (up to) three features of the original data
    and, lastly, the data reduced by Isomap.
    """
    data_dir = 'datasets/'
    data_set = 'glass/glass.data'
    # Named ``path`` rather than ``file`` to avoid shadowing a builtin name.
    path = os.path.join(data_dir, data_set)
    print('Displaying data set {%s} in the Rn' % path)

    glass = Retriever(path, delimiter=',')
    # Glass has the samples' ids in the first column.
    glass.split_column(0)
    # Additionally, its last column represents the target feature.
    glass.split_target()
    data, c = glass.retrieve()

    reduced_data = algorithms.Isomap(data, e=20).run()

    d = Displayer(title=data_set)

    # Scatter all dimensions (3-by-3), using as many graphs as necessary.
    for begin in range(0, glass.features_count, 3):
        end = min(glass.features_count, begin + 3)
        d.load(data[:, begin:end], color=c,
               title='Dimensions: d e [%i, %i]' % (begin + 1, end))

    # Same argument convention as the calls above: data first, then color
    # and title by keyword. The title string was previously passed in the
    # position of the data.
    d \
        .load(reduced_data, color=c, title='Reduced glass data-set') \
        .show()
def load_data(self):
    """Load the data set in ``self.file``, plot a slice of it and report its size.

    Sets ``self.data``, ``self.target`` and ``self.original_data``. Columns
    1 to 3 are handed to the displayer, which is then saved to
    'datasets/spam' and disposed.
    """
    retriever = Retriever(self.file, delimiter=',')
    self.data, self.target = retriever.split_target().retrieve()
    self.original_data = self.data

    plot = self.displayer.load(self.data[:, 1:4], self.target)
    plot.save('datasets/spam').dispose()

    size_in_kb = self.data.nbytes / 1024
    print('Data set size: %.2fKB' % size_in_kb)
    print('shape: %s' % str(self.data.shape))
def _run(self):
    """Display the data set three features at a time and print its correlations.

    The result is saved by the displayer under 'displaying_ilpd'.
    """
    # The data set file is delimited by commas.
    retriever = Retriever(self.data_set_file, delimiter=',')
    samples, target = retriever.split_target().retrieve()

    labels = [
        '\nAge',
        '\nGender',
        '\nTotal Bilirubin (TB)',
        '\nDirect Bilirubin (DB)',
        '\nAlkaline Phosphotase (Alk.)',
        '\nAlamine Aminotransferase (Sgpt)',
        '\nAspartate Aminotransferase (Sgot)',
        '\nTotal Protiens (TP)',
        '\nAlbumin (ALB)',
        '\nRatio Albumin and Globulin Ratio (A/G)'
    ]

    # Load the features in groups of three, each with its axis labels.
    group = 3
    for start in range(0, samples.shape[1], group):
        stop = start + group
        self.displayer.load(samples[:, start:stop], target,
                            axis_labels=labels[start:stop])

    print('Correlation matrix:')
    # ``rowvar=0``: each column is a variable and each row an observation.
    print(np.corrcoef(samples, rowvar=0))

    self.displayer.save('displaying_ilpd')
def _load_data(self):
    """Load the comma-delimited data set in ``self.file`` into this instance.

    Sets ``self.data``, ``self.target`` and ``self.original_data``.
    """
    retriever = Retriever(self.file, delimiter=',')

    # The first column holds the samples' ids; drop it.
    retriever.split_column(0)

    split = retriever.split_target()
    self.data, self.target = split.retrieve()
    self.original_data = self.data