def test_plot_correlation_scatter_plot(self):
    """Smoke-test comm.plot_correlation_scatter_plot on three columns.

    The columns are a base range, a linear transform of it and an
    unrelated list of values.  Nothing is asserted about the figure; it
    is only handed to the report.
    """
    base = range(10)
    linear_trans = [value * 3 + 1 for value in base]
    no_correlation = [1, 5, 8, 4, 1, 8, 5, 9, 0, 1]
    rows = zip(base, linear_trans, no_correlation)
    sa = utils.convert_to_sa(
        rows,
        col_names=['base', 'linear_trans', 'no_correlation'])
    fig = comm.plot_correlation_scatter_plot(sa, verbose=False)
    self.add_fig_to_report(fig, 'plot_correlation_scatter_plot')
def test_convert_to_sa(self):
    """Check utils.convert_to_sa on each kind of input used below.

    Covers: an existing structured array, a homogeneous ndarray with and
    without column names, and a list of lists with and without column
    names.
    """
    # already a structured array
    # Date components are written without leading zeros: `01` is an octal
    # literal in Python 2 and a SyntaxError in Python 3.
    sa = np.array([(1, 1.0, 'a', datetime(2015, 1, 1)),
                   (2, 2.0, 'b', datetime(2016, 1, 1))],
                  dtype=[('int', int), ('float', float), ('str', 'S1'),
                         ('date', 'M8[s]')])
    self.assertTrue(np.array_equal(sa, utils.convert_to_sa(sa)))

    # homogeneous array no col names provided
    nd = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    ctrl = np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)],
                    dtype=[('f0', int), ('f1', int), ('f2', int)])
    self.assertTrue(np.array_equal(ctrl, utils.convert_to_sa(nd)))

    # homogeneous array with col names provided
    nd = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    ctrl = np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)],
                    dtype=[('i0', int), ('i1', int), ('i2', int)])
    self.assertTrue(np.array_equal(ctrl, utils.convert_to_sa(
        nd,
        col_names=['i0', 'i1', 'i2'])))

    # list of lists no col name provided
    lol = [[1, 1, None],
           ['abc', 2, 3.4]]
    ctrl = np.array([('1', 1, np.nan), ('abc', 2, 3.4)],
                    dtype=[('f0', 'S3'), ('f1', int), ('f2', float)])
    res = utils.convert_to_sa(lol)
    self.assertTrue(utils_for_tests.array_equal(ctrl, res))

    # list of lists with col name provided
    lol = [['hello', 1.2, datetime(2012, 1, 1), None],
           [1.3, np.nan, None, '2013-01-01'],
           [1.4, 1.5, '2014-01-01', 'NO_SUCH_RECORD']]
    ctrl = np.array([('hello', 1.2, datetime(2012, 1, 1), utils.NOT_A_TIME),
                     ('1.3', np.nan, utils.NOT_A_TIME, datetime(2013, 1, 1)),
                     ('1.4', 1.5, datetime(2014, 1, 1), utils.NOT_A_TIME)],
                    dtype=[('i0', 'S5'), ('i1', float), ('i2', 'M8[us]'),
                           ('i3', 'M8[us]')])
    res = utils.convert_to_sa(lol, col_names=['i0', 'i1', 'i2', 'i3'])
    self.assertTrue(utils_for_tests.array_equal(ctrl, res))
def test_convert_to_sa(self):
    """Check utils.convert_to_sa on each kind of input used below.

    Covers: an existing structured array, a homogeneous ndarray with and
    without column names, and a list of lists with and without column
    names.
    """
    # already a structured array
    # Date components are written without leading zeros: `01` is an octal
    # literal in Python 2 and a SyntaxError in Python 3.
    sa = np.array(
        [(1, 1.0, 'a', datetime(2015, 1, 1)),
         (2, 2.0, 'b', datetime(2016, 1, 1))],
        dtype=[('int', int), ('float', float), ('str', 'S1'),
               ('date', 'M8[s]')])
    self.assertTrue(np.array_equal(sa, utils.convert_to_sa(sa)))

    # homogeneous array no col names provided
    nd = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    ctrl = np.array(
        [(1, 2, 3), (4, 5, 6), (7, 8, 9)],
        dtype=[('f0', int), ('f1', int), ('f2', int)])
    self.assertTrue(np.array_equal(ctrl, utils.convert_to_sa(nd)))

    # homogeneous array with col names provided
    nd = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    ctrl = np.array(
        [(1, 2, 3), (4, 5, 6), (7, 8, 9)],
        dtype=[('i0', int), ('i1', int), ('i2', int)])
    self.assertTrue(
        np.array_equal(
            ctrl,
            utils.convert_to_sa(nd, col_names=['i0', 'i1', 'i2'])))

    # list of lists no col name provided
    lol = [[1, 1, None],
           ['abc', 2, 3.4]]
    ctrl = np.array(
        [('1', 1, np.nan), ('abc', 2, 3.4)],
        dtype=[('f0', 'S3'), ('f1', int), ('f2', float)])
    res = utils.convert_to_sa(lol)
    self.assertTrue(utils_for_tests.array_equal(ctrl, res))

    # list of lists with col name provided
    lol = [['hello', 1.2, datetime(2012, 1, 1), None],
           [1.3, np.nan, None, '2013-01-01'],
           [1.4, 1.5, '2014-01-01', 'NO_SUCH_RECORD']]
    ctrl = np.array(
        [('hello', 1.2, datetime(2012, 1, 1), utils.NOT_A_TIME),
         ('1.3', np.nan, utils.NOT_A_TIME, datetime(2013, 1, 1)),
         ('1.4', 1.5, datetime(2014, 1, 1), utils.NOT_A_TIME)],
        dtype=[('i0', 'S5'), ('i1', float), ('i2', 'M8[us]'),
               ('i3', 'M8[us]')])
    res = utils.convert_to_sa(lol, col_names=['i0', 'i1', 'i2', 'i3'])
    self.assertTrue(utils_for_tests.array_equal(ctrl, res))
def label_encode(M):
    """
    Changes string cols to integers so that there is a 1-1 mapping between
    strings and ints

    Parameters
    ----------
    M : anything accepted by convert_to_sa
        Table whose string columns should be encoded.

    Returns
    -------
    numpy structured array
        A new array with the same column names as M.  Every byte-string
        or unicode column is replaced by the integer codes produced by
        sklearn's LabelEncoder (fitted independently per column); every
        other column is copied with its dtype unchanged.
    """
    M = convert_to_sa(M)
    le = preprocessing.LabelEncoder()
    col_names = M.dtype.names

    # Decide the output dtype first.  Using dtype.names / dtype.kind rather
    # than parsing dtype.descr strings avoids tripping over padding entries
    # and treats unicode ('U') columns as strings as well.
    string_cols = set()
    new_dtype = []
    for col_name in col_names:
        col_dtype = M.dtype[col_name]
        if col_dtype.kind in ('S', 'U'):
            string_cols.add(col_name)
            new_dtype.append((col_name, int))
        else:
            new_dtype.append((col_name, col_dtype))

    # Fill column by column instead of zipping rows together: zip() is a
    # lazy iterator on Python 3, which np.array cannot consume, and the
    # column-wise copy avoids building one Python tuple per row.
    result = np.empty(M.shape, dtype=new_dtype)
    for col_name in col_names:
        if col_name in string_cols:
            result[col_name] = le.fit_transform(M[col_name])
        else:
            result[col_name] = M[col_name]
    return result
def test_get_top_features(self):
    """Compare comm.get_top_features with ten hard-coded (name, score) rows.

    A RandomForestClassifier seeded with random_state=0 is fitted on the
    training part of a generated matrix, then queried for its top
    features on the full matrix.
    """
    M, labels = uft.generate_test_matrix(1000, 15, random_state=0)
    M = utils.cast_np_sa_to_nd(M)
    # NOTE(review): the split is not given a random_state, so the expected
    # scores below presumably depend on the global RNG state -- confirm.
    M_train, _M_test, labels_train, _labels_test = train_test_split(
        M, labels)
    clf = RandomForestClassifier(random_state=0)
    clf.fit(M_train, labels_train)

    res = comm.get_top_features(clf, M, verbose=False)

    expected_rows = [
        ('f5', 0.0773838526068),
        ('f13', 0.0769596713039),
        ('f8', 0.0751584839431),
        ('f6', 0.0730815879102),
        ('f11', 0.0684456133071),
        ('f9', 0.0666747414603),
        ('f10', 0.0659621889608),
        ('f7', 0.0657988099065),
        ('f2', 0.0634000069218),
        ('f0', 0.0632912268319),
    ]
    ctrl = utils.convert_to_sa(
        expected_rows,
        col_names=('feat_name', 'score'))
    self.assertTrue(uft.array_equal(ctrl, res))
def test_get_top_features(self):
    """Compare comm.get_top_features with ten hard-coded (name, score) rows.

    The classifier is a RandomForestClassifier with random_state=0, fitted
    on the training part of a generated 1000 x 15 test matrix; the top
    features are then requested for the full matrix.
    """
    M, labels = uft.generate_test_matrix(1000, 15, random_state=0)
    M = utils.cast_np_sa_to_nd(M)

    # Only the training halves are used; the test halves are discarded.
    # NOTE(review): no random_state is passed to the split, so the expected
    # scores presumably rely on the global RNG state -- confirm.
    M_train, _, labels_train, _ = train_test_split(M, labels)

    clf = RandomForestClassifier(random_state=0)
    clf.fit(M_train, labels_train)
    res = comm.get_top_features(clf, M, verbose=False)

    expected = [('f5', 0.0773838526068),
                ('f13', 0.0769596713039),
                ('f8', 0.0751584839431),
                ('f6', 0.0730815879102),
                ('f11', 0.0684456133071),
                ('f9', 0.0666747414603),
                ('f10', 0.0659621889608),
                ('f7', 0.0657988099065),
                ('f2', 0.0634000069218),
                ('f0', 0.0632912268319)]
    ctrl = utils.convert_to_sa(expected, col_names=('feat_name', 'score'))
    self.assertTrue(uft.array_equal(ctrl, res))
def replace_missing_vals(M, strategy, missing_val=np.nan, constant=0):
    """Return a copy of M with missing values in numeric columns replaced.

    Parameters
    ----------
    M : anything accepted by convert_to_sa
        Table to clean.  The input is not modified; a copy is returned.
    strategy : {'mean', 'median', 'most_frequent', 'constant'}
        'constant' writes `constant` over every missing entry.  The other
        values are passed to sklearn's Imputer as its strategy.
    missing_val : value, default np.nan
        Value that marks an entry as missing.
    constant : value, default 0
        Replacement used when strategy is 'constant'.

    Returns
    -------
    numpy structured array
        Copy of M with replacements applied to integer and float columns.
        Other columns are left untouched.

    Raises
    ------
    ValueError
        If `strategy` is not one of the four supported names.
    """
    # TODO support times, strings
    M = convert_to_sa(M)

    if strategy not in ('mean', 'median', 'most_frequent', 'constant'):
        raise ValueError('Invalid strategy')

    M_cp = M.copy()
    # Classify columns by dtype kind instead of searching the descr string:
    # a substring test such as `'f' in '<M8[fs]'` would wrongly treat a
    # femtosecond datetime column as a float column.
    col_kinds = [(name, M_cp.dtype[name].kind) for name in M_cp.dtype.names]

    if strategy == 'constant':
        try:
            missing_is_nan = np.isnan(missing_val)
        except TypeError:
            # missing_val is not a float
            missing_is_nan = False

        if missing_is_nan:
            # nan never compares equal to itself, so it has to be located
            # with np.isnan; only float columns can hold it.
            for col_name, kind in col_kinds:
                if kind == 'f':
                    col = M_cp[col_name]  # view into M_cp
                    col[np.isnan(col)] = constant
            return M_cp

        for col_name, kind in col_kinds:
            if kind in ('i', 'f'):
                col = M_cp[col_name]  # view into M_cp
                col[col == missing_val] = constant
        return M_cp

    # we're doing one of the sklearn imputer strategies
    # NOTE(review): axis=1 combined with a 1-D column presumably relies on
    # the Imputer treating the column as a single row -- confirm against
    # the installed scikit-learn version.
    imp = Imputer(missing_values=missing_val, strategy=strategy, axis=1)
    for col_name, kind in col_kinds:
        if kind in ('i', 'f'):
            # The Imputer only works on float and int columns
            col = M_cp[col_name]
            col[:] = imp.fit_transform(col)
    return M_cp