def describe_cols(M, verbose=True):
    """Build a table of summary statistics, one row per column of M.

    A column whose dtype string contains 'f' or 'i' is passed to every
    function listed in ``__describe_cols_metrics``; any other column gets
    the values of ``__describe_cols_fill`` in place of computed metrics.

    Parameters
    ----------
    M : numpy.ndarray
        structured array
    verbose : boolean
        iff True, prints the summary with pprint_sa before returning it

    Returns
    -------
    numpy.ndarray
        structured array of summary statistics for M. The first column is
        'Column Name'; the remaining columns are named after the metrics.

    """
    M = utils.check_sa(M)

    summary = []
    for name, type_str in M.dtype.descr:
        has_stats = 'f' in type_str or 'i' in type_str
        if not has_stats:
            # No metrics are computed for this column; use the filler cells.
            summary.append([name] + __describe_cols_fill)
            continue
        values = M[name]
        summary.append(
            [name] + [metric(values) for _, metric in __describe_cols_metrics])

    header = ['Column Name']
    header.extend(metric_name for metric_name, _ in __describe_cols_metrics)

    stats = convert_to_sa(summary, col_names=header)
    if verbose:
        pprint_sa(stats)
    return stats
def describe_cols(M, verbose=True):
    """Summarise every column of a structured array.

    For each column, the dtype string is inspected: if it contains 'f' or
    'i', each function in ``__describe_cols_metrics`` is applied to the
    column; otherwise the row is padded with ``__describe_cols_fill``.

    Parameters
    ----------
    M : numpy.ndarray
        structured array
    verbose : boolean
        iff True, prints the resulting table with pprint_sa

    Returns
    -------
    numpy.ndarray
        structured array of summary statistics for M, with one row per
        column of M and a leading 'Column Name' column

    """
    M = utils.check_sa(M)

    rows = []
    for name, type_str in M.dtype.descr:
        if 'f' in type_str or 'i' in type_str:
            data = M[name]
            cells = [metric(data) for _, metric in __describe_cols_metrics]
        else:
            cells = __describe_cols_fill
        # Concatenation builds a fresh list, so the shared fill list is
        # never aliased into the output rows.
        rows.append([name] + cells)

    metric_names = [label for label, _ in __describe_cols_metrics]
    result = convert_to_sa(rows, col_names=['Column Name'] + metric_names)
    if verbose:
        pprint_sa(result)
    return result
def test_plot_correlation_scatter_plot(self):
    """Smoke-test dsp.plot_correlation_scatter_plot on three small columns.

    The columns are a base range, a linear transform of that range
    (3 * x + 1), and a hand-picked sequence labelled 'no_correlation'.
    The returned figure is handed to add_fig_to_report; nothing is
    asserted about its contents.
    """
    col1 = range(10)
    col2 = [cell * 3 + 1 for cell in col1]
    col3 = [1, 5, 8, 4, 1, 8, 5, 9, 0, 1]
    # zip() is a lazy, one-shot iterator on Python 3; materialise the rows
    # so convert_to_sa receives a real sequence on both major versions.
    rows = list(zip(col1, col2, col3))
    sa = utils.convert_to_sa(
        rows,
        col_names=['base', 'linear_trans', 'no_correlation'])
    fig = dsp.plot_correlation_scatter_plot(sa, verbose=False)
    self.add_fig_to_report(fig, 'plot_correlation_scatter_plot')
def test_convert_to_sa(self):
    """Check utils.convert_to_sa against hand-built control arrays.

    Covers five inputs: an existing structured array, a homogeneous
    ndarray without and with explicit column names, and lists of lists
    without and with explicit column names.
    """
    # already a structured array
    # NOTE: date components are written as plain 1, not 01 -- a leading-zero
    # integer literal is a SyntaxError on Python 3 (and octal on Python 2).
    sa = np.array(
        [(1, 1.0, 'a', datetime(2015, 1, 1)),
         (2, 2.0, 'b', datetime(2016, 1, 1))],
        dtype=[('int', int), ('float', float), ('str', 'O'),
               ('date', 'M8[s]')])
    self.assertTrue(np.array_equal(sa, utils.convert_to_sa(sa)))

    # homogeneous array no col names provided
    nd = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    ctrl = np.array(
        [(1, 2, 3), (4, 5, 6), (7, 8, 9)],
        dtype=[('f0', int), ('f1', int), ('f2', int)])
    self.assertTrue(np.array_equal(ctrl, utils.convert_to_sa(nd)))

    # homogeneous array with col names provided
    nd = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    ctrl = np.array(
        [(1, 2, 3), (4, 5, 6), (7, 8, 9)],
        dtype=[('i0', int), ('i1', int), ('i2', int)])
    self.assertTrue(np.array_equal(
        ctrl,
        utils.convert_to_sa(nd, col_names=['i0', 'i1', 'i2'])))

    # list of lists no col name provided
    lol = [[1, 1, None], ['abc', 2, 3.4]]
    ctrl = np.array(
        [('1', 1, np.nan), ('abc', 2, 3.4)],
        dtype=[('f0', 'S3'), ('f1', int), ('f2', float)])
    res = utils.convert_to_sa(lol)
    self.assertTrue(uft.array_equal(ctrl, res))

    # list of lists with col name provided
    lol = [['hello', 1.2, datetime(2012, 1, 1), None],
           [1.3, np.nan, None, '2013-01-01'],
           [1.4, 1.5, '2014-01-01', 'NO_SUCH_RECORD']]
    ctrl = np.array(
        [('hello', 1.2, datetime(2012, 1, 1), utils.NOT_A_TIME),
         ('1.3', np.nan, utils.NOT_A_TIME, datetime(2013, 1, 1)),
         ('1.4', 1.5, datetime(2014, 1, 1), utils.NOT_A_TIME)],
        dtype=[('i0', 'S5'), ('i1', float), ('i2', 'M8[us]'),
               ('i3', 'M8[us]')])
    res = utils.convert_to_sa(lol, col_names=['i0', 'i1', 'i2', 'i3'])
    self.assertTrue(uft.array_equal(ctrl, res))
def get_top_features(clf, M=None, col_names=None, n=10, verbose=True):
    """Gets the top features for a fitted clf

    Column names are resolved in this order: an explicit ``col_names``
    argument wins; otherwise the field names of ``M`` are used when ``M``
    is a structured array; otherwise names 'f0', 'f1', ... are generated,
    one per feature importance score.

    Parameters
    ----------
    clf : sklearn.base.BaseEstimator
        Fitted classifier with a feature_importances_ attribute
    M : numpy.ndarray or None
        Structured array corresponding to fitted clf. Used here to
        determine column names
    col_names : list of str or None
        List of column names corresponding to fitted clf.
    n : int
        Number of features to return
    verbose : boolean
        iff True, prints ranked features

    Returns
    -------
    numpy.ndarray
        structured array with top feature names and scores, in columns
        'feat_name' and 'score', ordered from highest to lowest score

    Raises
    ------
    ValueError
        if clf is not an instance of sklearn.base.BaseEstimator

    """
    if not isinstance(clf, BaseEstimator):
        raise ValueError(
            'clf must be an instance of sklearn.base.BaseEstimator')

    scores = clf.feature_importances_
    if col_names is None:
        if is_sa(M):
            col_names = M.dtype.names
        else:
            # range (rather than the Python 2-only xrange) behaves the same
            # here on Python 2 and Python 3.
            col_names = ['f{}'.format(i) for i in range(len(scores))]
    else:
        col_names = utils.check_col_names(col_names, n_cols=scores.shape[0])

    # argsort is ascending, so reverse it to rank the highest score first.
    ranked_name_and_score = [(col_names[x], scores[x])
                             for x in scores.argsort()[::-1]]
    ranked_name_and_score = convert_to_sa(
        ranked_name_and_score[:n],
        col_names=('feat_name', 'score'))
    if verbose:
        pprint_sa(ranked_name_and_score)
    return ranked_name_and_score
def crosstab(col1, col2, verbose=True):
    """
    Makes a crosstab of col1 and col2. This is represented as a
    structured array with the following properties:

    1. The first column is the value of col1 being crossed
    2. The name of every column except the first is the value of col2 being
       crossed
    3. To find the number of cooccurrences of x from col1 and y in col2,
       find the row that has 'x' in col1 and the column named 'y'. The
       corresponding cell is the number of cooccurrences of x and y

    Parameters
    ----------
    col1 : np.ndarray
    col2 : np.ndarray
    verbose : boolean
        iff True, prints the crosstab with pprint_sa

    Returns
    -------
    np.ndarray
        structured array

    """
    col1 = utils.check_col(col1, argument_name='col1')
    col2 = utils.check_col(col2, argument_name='col2')
    col1_unique = np.unique(col1)
    col2_unique = np.unique(col2)

    crosstab_rows = []
    for col1_val in col1_unique:
        loc_col1_val = np.where(col1 == col1_val)[0]
        cnt = Counter(col2[loc_col1_val])
        # A Counter yields 0 for keys it has not seen, so no membership
        # test is needed (dict.has_key no longer exists on Python 3).
        counts = [cnt[col2_val] for col2_val in col2_unique]
        crosstab_rows.append(['{}'.format(col1_val)] + counts)

    col_names = ['col1_value'] + ['{}'.format(col2_val)
                                  for col2_val in col2_unique]
    ret = convert_to_sa(crosstab_rows, col_names=col_names)
    if verbose:
        pprint_sa(ret)
    return ret
def get_top_features(clf, M=None, col_names=None, n=10, verbose=True):
    """Gets the top features for a fitted clf

    If ``col_names`` is given it is validated with utils.check_col_names
    and used as-is. Otherwise the field names of ``M`` are used when ``M``
    is a structured array, and generated names 'f0', 'f1', ... are used
    when it is not.

    Parameters
    ----------
    clf : sklearn.base.BaseEstimator
        Fitted classifier with a feature_importances_ attribute
    M : numpy.ndarray or None
        Structured array corresponding to fitted clf. Used here to
        determine column names
    col_names : list of str or None
        List of column names corresponding to fitted clf.
    n : int
        Number of features to return
    verbose : boolean
        iff True, prints ranked features

    Returns
    -------
    numpy.ndarray
        structured array with top feature names and scores, in columns
        'feat_name' and 'score', ordered from highest to lowest score

    Raises
    ------
    ValueError
        if clf is not an instance of sklearn.base.BaseEstimator

    """
    if not isinstance(clf, BaseEstimator):
        raise ValueError(
            'clf must be an instance of sklearn.base.BaseEstimator')

    scores = clf.feature_importances_
    if col_names is not None:
        col_names = utils.check_col_names(col_names, n_cols=scores.shape[0])
    elif is_sa(M):
        col_names = M.dtype.names
    else:
        # range works on both Python 2 and Python 3; xrange is Python 2 only.
        col_names = ['f{}'.format(i) for i in range(len(scores))]

    # Indices of the scores from largest to smallest.
    descending = scores.argsort()[::-1]
    ranked_name_and_score = [(col_names[x], scores[x]) for x in descending]
    ranked_name_and_score = convert_to_sa(
        ranked_name_and_score[:n],
        col_names=('feat_name', 'score'))
    if verbose:
        pprint_sa(ranked_name_and_score)
    return ranked_name_and_score
def crosstab(col1, col2, verbose=True):
    """
    Makes a crosstab of col1 and col2. This is represented as a
    structured array with the following properties:

    1. The first column is the value of col1 being crossed
    2. The name of every column except the first is the value of col2 being
       crossed
    3. To find the number of cooccurrences of x from col1 and y in col2,
       find the row that has 'x' in col1 and the column named 'y'. The
       corresponding cell is the number of cooccurrences of x and y

    Parameters
    ----------
    col1 : np.ndarray
    col2 : np.ndarray
    verbose : boolean
        iff True, prints the crosstab with pprint_sa

    Returns
    -------
    np.ndarray
        structured array

    """
    col1 = utils.check_col(col1, argument_name='col1')
    col2 = utils.check_col(col2, argument_name='col2')
    col1_unique = np.unique(col1)
    col2_unique = np.unique(col2)

    crosstab_rows = []
    for col1_val in col1_unique:
        loc_col1_val = np.where(col1 == col1_val)[0]
        col2_vals = col2[loc_col1_val]
        cnt = Counter(col2_vals)
        # Counter returns 0 for a missing key, which makes the former
        # has_key() guard (unavailable on Python 3) unnecessary.
        counts = [cnt[col2_val] for col2_val in col2_unique]
        crosstab_rows.append(['{}'.format(col1_val)] + counts)

    col_names = ['col1_value']
    col_names += ['{}'.format(col2_val) for col2_val in col2_unique]
    ret = convert_to_sa(crosstab_rows, col_names=col_names)
    if verbose:
        pprint_sa(ret)
    return ret
def test_get_top_features(self):
    """Check dsp.get_top_features against a ranking computed in the test.

    The control is built from the fitted classifier's own
    feature_importances_, so it does not depend on hard-coded scores.
    Both ways of supplying column names are exercised: implicitly via M
    and explicitly via col_names.
    """
    M, labels = uft.generate_test_matrix(1000, 15, random_state=0)
    M = utils.cast_np_sa_to_nd(M)
    M_train, M_test, labels_train, labels_test = train_test_split(
        M, labels)
    clf = RandomForestClassifier(random_state=0)
    clf.fit(M_train, labels_train)

    ctrl_feat_importances = clf.feature_importances_
    # range instead of the Python 2-only xrange.
    ctrl_col_names = ['f{}'.format(i) for i in range(15)]
    # Indices of the ten largest importances, largest first.
    ctrl_feat_ranks = np.argsort(ctrl_feat_importances)[::-1][:10]
    # zip() is lazy on Python 3; hand convert_to_sa a concrete list.
    ctrl = utils.convert_to_sa(
        list(zip(ctrl_col_names, ctrl_feat_importances)),
        col_names=('feat_name', 'score'))[ctrl_feat_ranks]

    res = dsp.get_top_features(clf, M, verbose=False)
    self.assertTrue(uft.array_equal(ctrl, res))

    res = dsp.get_top_features(
        clf,
        col_names=['f{}'.format(i) for i in range(15)],
        verbose=False)
    self.assertTrue(uft.array_equal(ctrl, res))
def table(col, verbose=True):
    """
    Creates a summary of the number of occurrences of each value in the
    column

    Similar to R's table

    Parameters
    ----------
    col : np.ndarray
    verbose : boolean
        iff True, prints the summary with pprint_sa

    Returns
    -------
    np.ndarray
        structured array with columns 'col_name' (the distinct value) and
        'count' (its number of occurrences), sorted by value

    """
    col = utils.check_col(col)
    cnt = Counter(col)
    # items() exists on both Python 2 and Python 3; iteritems() is
    # Python 2 only.
    cat_and_cnt = sorted(cnt.items(), key=lambda item: item[0])
    ret = convert_to_sa(cat_and_cnt, col_names=('col_name', 'count'))
    if verbose:
        pprint_sa(ret)
    return ret
def table(col, verbose=True):
    """
    Creates a summary of the number of occurrences of each value in the
    column

    Similar to R's table

    Parameters
    ----------
    col : np.ndarray
    verbose : boolean
        iff True, prints the summary with pprint_sa

    Returns
    -------
    np.ndarray
        structured array with one row per distinct value: 'col_name'
        holds the value and 'count' how often it occurs. Rows are sorted
        by value.

    """
    col = utils.check_col(col)
    cnt = Counter(col)
    # Use items() rather than the Python 2-only iteritems() so the function
    # runs unchanged on Python 3; sorting is by the value, not the count.
    cat_and_cnt = sorted(cnt.items(), key=lambda item: item[0])
    ret = convert_to_sa(cat_and_cnt, col_names=('col_name', 'count'))
    if verbose:
        pprint_sa(ret)
    return ret
def test_get_top_features(self):
    """Check dsp.get_top_features against a fixed expected ranking.

    Both ways of supplying column names are exercised: implicitly via M
    and explicitly via col_names.
    """
    M, labels = uft.generate_test_matrix(1000, 15, random_state=0)
    M = utils.cast_np_sa_to_nd(M)
    M_train, M_test, labels_train, labels_test = train_test_split(
        M, labels)
    clf = RandomForestClassifier(random_state=0)
    clf.fit(M_train, labels_train)

    res = dsp.get_top_features(clf, M, verbose=False)
    # NOTE(review): these scores are hard-coded, while train_test_split is
    # called without a random_state here -- presumably they only match for
    # one specific environment/seed state; confirm before relying on them.
    ctrl = utils.convert_to_sa(
        [('f5', 0.0773838526068),
         ('f13', 0.0769596713039),
         ('f8', 0.0751584839431),
         ('f6', 0.0730815879102),
         ('f11', 0.0684456133071),
         ('f9', 0.0666747414603),
         ('f10', 0.0659621889608),
         ('f7', 0.0657988099065),
         ('f2', 0.0634000069218),
         ('f0', 0.0632912268319)],
        col_names=('feat_name', 'score'))
    self.assertTrue(uft.array_equal(ctrl, res))

    # range instead of the Python 2-only xrange.
    res = dsp.get_top_features(
        clf,
        col_names=['f{}'.format(i) for i in range(15)],
        verbose=False)
    self.assertTrue(uft.array_equal(ctrl, res))