示例#1
0
def describe_cols(M, verbose=True):
    """Returns summary statistics for a numpy array

    Parameters
    ----------
    M : numpy.ndarray
        structured array
    verbose : boolean
        iff True, prints the resulting summary table

    Returns
    -------
    numpy.ndarray
        structured array of summary statistics for M. There is one row per
        column of M. The first field is the column name and the remaining
        fields are the metrics listed in __describe_cols_metrics. Columns
        that are neither float nor signed integer are filled with
        __describe_cols_fill instead of computed metrics.

    """
    M = utils.check_sa(M)
    descr_rows = []
    # Walk dtype.names instead of dtype.descr: descr entries are not always
    # (name, type) pairs (sub-array fields add a shape, titled fields use a
    # tuple as the name, padded layouts add unnamed void entries), so
    # unpacking them as pairs can fail.
    for col_name in M.dtype.names:
        # dtype.kind is a one-character type code ('f' float, 'i' signed
        # int). Testing it directly avoids the false positives a substring
        # search on the type string produces, e.g. '<M8[fs]' contains 'f'.
        if M.dtype[col_name].kind in ('f', 'i'):
            col = M[col_name]
            row = [col_name
                   ] + [func(col) for _, func in __describe_cols_metrics]
        else:
            row = [col_name] + __describe_cols_fill
        descr_rows.append(row)
    col_names = ['Column Name'
                 ] + [col_name for col_name, _ in __describe_cols_metrics]
    ret = convert_to_sa(descr_rows, col_names=col_names)
    if verbose:
        pprint_sa(ret)
    return ret
示例#2
0
def describe_cols(M, verbose=True):
    """Returns summary statistics for a numpy array

    Parameters
    ----------
    M : numpy.ndarray
        structured array
    verbose : boolean
        iff True, prints the resulting summary table

    Returns
    -------
    numpy.ndarray
        structured array of summary statistics for M. There is one row per
        column of M. The first field is the column name and the remaining
        fields are the metrics listed in __describe_cols_metrics. Columns
        that are neither float nor signed integer are filled with
        __describe_cols_fill instead of computed metrics.

    """
    M = utils.check_sa(M)
    descr_rows = []
    # dtype.names always yields plain field names, whereas dtype.descr can
    # yield 3-tuples (sub-array fields), (title, name) tuples or unnamed
    # padding entries, none of which unpack as a (name, type) pair.
    for col_name in M.dtype.names:
        # Compare the one-character dtype kind rather than searching the
        # type string for 'f'/'i', which also matches e.g. '<m8[fs]'.
        if M.dtype[col_name].kind in ('f', 'i'):
            col = M[col_name]
            row = [col_name] + [func(col) for _, func in
                                __describe_cols_metrics]
        else:
            row = [col_name] + __describe_cols_fill
        descr_rows.append(row)
    col_names = ['Column Name'] + [col_name for col_name, _ in
                                   __describe_cols_metrics]
    ret = convert_to_sa(descr_rows, col_names=col_names)
    if verbose:
        pprint_sa(ret)
    return ret
示例#3
0
 def test_plot_correlation_scatter_plot(self):
     # Three columns: a base sequence, an exact linear transform of it, and
     # a hand-picked column labelled as uncorrelated.
     base = range(10)
     linear_trans = [3 * value + 1 for value in base]
     no_correlation = [1, 5, 8, 4, 1, 8, 5, 9, 0, 1]
     rows = zip(base, linear_trans, no_correlation)
     names = ['base', 'linear_trans', 'no_correlation']
     sa = utils.convert_to_sa(rows, col_names=names)
     # verbose=False is passed so the figure is only attached to the report
     # below (presumably it suppresses interactive display -- confirm).
     fig = dsp.plot_correlation_scatter_plot(sa, verbose=False)
     self.add_fig_to_report(fig, 'plot_correlation_scatter_plot')
示例#4
0
    def test_convert_to_sa(self):
        # Exercises utils.convert_to_sa on each supported input form and
        # compares the result against a hand-built structured array.

        # already a structured array
        # (month/day are written as plain decimals: a leading-zero literal
        # such as 01 is a legacy octal form and a SyntaxError on Python 3)
        sa = np.array([(1, 1.0, 'a', datetime(2015, 1, 1)),
                       (2, 2.0, 'b', datetime(2016, 1, 1))],
                      dtype=[('int', int), ('float', float), ('str', 'O'),
                             ('date', 'M8[s]')])
        self.assertTrue(np.array_equal(sa, utils.convert_to_sa(sa)))

        # homogeneous array no col names provided
        nd = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
        ctrl = np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)],
                        dtype=[('f0', int), ('f1', int), ('f2', int)])
        self.assertTrue(np.array_equal(ctrl, utils.convert_to_sa(nd)))

        # homogeneous array with col names provided
        nd = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
        ctrl = np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)],
                        dtype=[('i0', int), ('i1', int), ('i2', int)])
        self.assertTrue(np.array_equal(ctrl, utils.convert_to_sa(
            nd,
            col_names=['i0', 'i1', 'i2'])))

        # list of lists no col name provided
        # (mixed str/int cells are expected to become strings, and None in
        # a numeric column is expected to become NaN)
        lol = [[1, 1, None],
               ['abc', 2, 3.4]]
        ctrl = np.array([('1', 1, np.nan),
                         ('abc', 2, 3.4)],
                        dtype=[('f0', 'S3'), ('f1', int), ('f2', float)])
        res = utils.convert_to_sa(lol)
        self.assertTrue(uft.array_equal(ctrl, res))

        # list of lists with col name provided
        # (None and unparseable strings in a date column are expected to
        # become utils.NOT_A_TIME)
        lol = [['hello', 1.2, datetime(2012, 1, 1), None],
               [1.3, np.nan, None, '2013-01-01'],
               [1.4, 1.5, '2014-01-01', 'NO_SUCH_RECORD']]
        ctrl = np.array([('hello', 1.2, datetime(2012, 1, 1), utils.NOT_A_TIME),
                         ('1.3', np.nan, utils.NOT_A_TIME, datetime(2013, 1, 1)),
                         ('1.4', 1.5, datetime(2014, 1, 1), utils.NOT_A_TIME)],
                        dtype=[('i0', 'S5'), ('i1', float), ('i2', 'M8[us]'),
                               ('i3', 'M8[us]')])
        res = utils.convert_to_sa(lol, col_names=['i0', 'i1', 'i2', 'i3'])
        self.assertTrue(uft.array_equal(ctrl, res))
示例#5
0
def get_top_features(clf, M=None, col_names=None, n=10, verbose=True):
    """Gets the top features for a fitted clf

    Parameters
    ----------
    clf : sklearn.base.BaseEstimator
        Fitted classifier with a feature_importances_ attribute
    M : numpy.ndarray or None
        Structured array corresponding to fitted clf. Used here to determine
        column names. Ignored if col_names is provided.
    col_names : list of str or None
        List of column names corresponding to fitted clf. If both col_names
        and a structured M are missing, names 'f0', 'f1', ... are generated.
    n : int
        Number of features to return
    verbose : boolean
        iff True, prints ranked features

    Returns
    -------
    numpy.ndarray
        structured array with top feature names and scores, with fields
        'feat_name' and 'score', ordered from highest to lowest score

    Raises
    ------
    ValueError
        If clf is not an instance of sklearn.base.BaseEstimator

    """
    if not isinstance(clf, BaseEstimator):
        raise ValueError('clf must be an instance of sklearn.Base.BaseEstimator')

    scores = clf.feature_importances_
    if col_names is None:
        if is_sa(M):
            col_names = M.dtype.names
        else:
            # range (not xrange) so this also runs on Python 3
            col_names = ['f{}'.format(i) for i in range(len(scores))]
    else:
        col_names = utils.check_col_names(col_names, n_cols=scores.shape[0])
    # argsort is ascending, so reverse it to rank the best features first
    ranked_name_and_score = [(col_names[x], scores[x]) for x in
                             scores.argsort()[::-1]]
    ranked_name_and_score = convert_to_sa(
            ranked_name_and_score[:n],
            col_names=('feat_name', 'score'))
    if verbose:
        pprint_sa(ranked_name_and_score)
    return ranked_name_and_score
示例#6
0
def crosstab(col1, col2, verbose=True):
    """
    Makes a crosstab of col1 and col2. This is represented as a
    structured array with the following properties:

    1. The first column is the value of col1 being crossed
    2. The name of every column except the first is the value of col2 being
       crossed
    3. To find the number of cooccurrences of x from col1 and y in col2,
       find the row that has 'x' in col1 and the column named 'y'. The
       corresponding cell is the number of cooccurrences of x and y

    Parameters
    ----------
    col1 : np.ndarray
    col2 : np.ndarray
    verbose : boolean
        iff True, prints the crosstab

    Returns
    -------
    np.ndarray
        structured array

    """
    col1 = utils.check_col(col1, argument_name='col1')
    col2 = utils.check_col(col2, argument_name='col2')
    col1_unique = np.unique(col1)
    col2_unique = np.unique(col2)
    crosstab_rows = []
    for col1_val in col1_unique:
        # Tally the col2 values found at the positions where col1 equals
        # the current value
        loc_col1_val = np.where(col1 == col1_val)[0]
        col2_vals = col2[loc_col1_val]
        cnt = Counter(col2_vals)
        # A Counter returns 0 for a missing key (without inserting it), so
        # no explicit membership test is needed; dict.has_key no longer
        # exists on Python 3.
        counts = [cnt[col2_val] for col2_val in col2_unique]
        crosstab_rows.append(['{}'.format(col1_val)] + counts)
    col_names = ['col1_value'
                 ] + ['{}'.format(col2_val) for col2_val in col2_unique]
    ret = convert_to_sa(crosstab_rows, col_names=col_names)
    if verbose:
        pprint_sa(ret)
    return ret
示例#7
0
def get_top_features(clf, M=None, col_names=None, n=10, verbose=True):
    """Gets the top features for a fitted clf

    Parameters
    ----------
    clf : sklearn.base.BaseEstimator
        Fitted classifier with a feature_importances_ attribute
    M : numpy.ndarray or None
        Structured array corresponding to fitted clf. Used here to determine
        column names. Ignored if col_names is provided.
    col_names : list of str or None
        List of column names corresponding to fitted clf. If both col_names
        and a structured M are missing, names 'f0', 'f1', ... are generated.
    n : int
        Number of features to return
    verbose : boolean
        iff True, prints ranked features

    Returns
    -------
    numpy.ndarray
        structured array with top feature names and scores, with fields
        'feat_name' and 'score', ordered from highest to lowest score

    Raises
    ------
    ValueError
        If clf is not an instance of sklearn.base.BaseEstimator

    """
    if not isinstance(clf, BaseEstimator):
        raise ValueError(
            'clf must be an instance of sklearn.Base.BaseEstimator')

    scores = clf.feature_importances_
    if col_names is None:
        if is_sa(M):
            col_names = M.dtype.names
        else:
            # range (not xrange) so this also runs on Python 3
            col_names = ['f{}'.format(i) for i in range(len(scores))]
    else:
        col_names = utils.check_col_names(col_names, n_cols=scores.shape[0])
    # argsort is ascending, so reverse it to rank the best features first
    ranked_name_and_score = [(col_names[x], scores[x])
                             for x in scores.argsort()[::-1]]
    ranked_name_and_score = convert_to_sa(ranked_name_and_score[:n],
                                          col_names=('feat_name', 'score'))
    if verbose:
        pprint_sa(ranked_name_and_score)
    return ranked_name_and_score
示例#8
0
def crosstab(col1, col2, verbose=True):
    """
    Makes a crosstab of col1 and col2. This is represented as a
    structured array with the following properties:

    1. The first column is the value of col1 being crossed
    2. The name of every column except the first is the value of col2 being
       crossed
    3. To find the number of cooccurrences of x from col1 and y in col2,
       find the row that has 'x' in col1 and the column named 'y'. The
       corresponding cell is the number of cooccurrences of x and y

    Parameters
    ----------
    col1 : np.ndarray
    col2 : np.ndarray
    verbose : boolean
        iff True, prints the crosstab

    Returns
    -------
    np.ndarray
        structured array

    """
    col1 = utils.check_col(col1, argument_name='col1')
    col2 = utils.check_col(col2, argument_name='col2')
    col1_unique = np.unique(col1)
    col2_unique = np.unique(col2)
    crosstab_rows = []
    for col1_val in col1_unique:
        # Tally the col2 values found at the positions where col1 equals
        # the current value
        loc_col1_val = np.where(col1 == col1_val)[0]
        col2_vals = col2[loc_col1_val]
        cnt = Counter(col2_vals)
        # Counter lookups default to 0 for absent keys, which makes the
        # has_key guard redundant (and has_key was removed in Python 3).
        counts = [cnt[col2_val] for col2_val in col2_unique]
        crosstab_rows.append(['{}'.format(col1_val)] + counts)
    col_names = ['col1_value'] + ['{}'.format(col2_val) for col2_val in
                                  col2_unique]
    ret = convert_to_sa(crosstab_rows, col_names=col_names)
    if verbose:
        pprint_sa(ret)
    return ret
示例#9
0
    def test_get_top_features(self):
        # Fits a random forest and checks that dsp.get_top_features returns
        # the ten highest feature importances, both with generated column
        # names and with explicitly supplied ones.
        M, labels = uft.generate_test_matrix(1000, 15, random_state=0)
        M = utils.cast_np_sa_to_nd(M)
        M_train, M_test, labels_train, labels_test = train_test_split(
                M,
                labels)
        clf = RandomForestClassifier(random_state=0)
        clf.fit(M_train, labels_train)

        # Derive the expected ranking from the fitted model itself
        ctrl_feat_importances = clf.feature_importances_
        # range (not xrange) so this also runs on Python 3
        ctrl_col_names = ['f{}'.format(i) for i in range(15)]
        ctrl_feat_ranks = np.argsort(ctrl_feat_importances)[::-1][:10]
        # zip is a one-shot iterator on Python 3; materialise it so
        # convert_to_sa receives a concrete sequence on both versions
        ctrl = utils.convert_to_sa(
                list(zip(ctrl_col_names, ctrl_feat_importances)),
                col_names=('feat_name', 'score'))[ctrl_feat_ranks]

        res = dsp.get_top_features(clf, M, verbose=False)
        self.assertTrue(uft.array_equal(ctrl, res))

        res = dsp.get_top_features(
                clf,
                col_names=['f{}'.format(i) for i in range(15)],
                verbose=False)
        self.assertTrue(uft.array_equal(ctrl, res))
示例#10
0
def table(col, verbose=True):
    """
    Creates a summary of the number of occurrences of each value in the column

    Similar to R's table

    Parameters
    ----------
    col : np.ndarray
    verbose : boolean
        iff True, prints the summary

    Returns
    -------
    np.ndarray
        structured array with one row per distinct value, sorted by value.
        The field 'col_name' holds the value and 'count' holds its number
        of occurrences

    """
    col = utils.check_col(col)
    cnt = Counter(col)
    # items() exists on both Python 2 and 3 (iteritems is Python 2 only);
    # sorted() builds a new list either way, so the result is unchanged.
    cat_and_cnt = sorted(cnt.items(), key=lambda item: item[0])
    ret = convert_to_sa(cat_and_cnt, col_names=('col_name', 'count'))
    if verbose:
        pprint_sa(ret)
    return ret
示例#11
0
def table(col, verbose=True):
    """
    Creates a summary of the number of occurrences of each value in the column

    Similar to R's table

    Parameters
    ----------
    col : np.ndarray
    verbose : boolean
        iff True, prints the summary

    Returns
    -------
    np.ndarray
        structured array with one row per distinct value, sorted by value.
        The field 'col_name' holds the value and 'count' holds its number
        of occurrences

    """
    col = utils.check_col(col)
    cnt = Counter(col)
    # Use items() rather than the Python-2-only iteritems(); sorting by the
    # first tuple element orders the rows by the distinct value.
    cat_and_cnt = sorted(cnt.items(), key=lambda item: item[0])
    ret = convert_to_sa(cat_and_cnt, col_names=('col_name', 'count'))
    if verbose:
        pprint_sa(ret)
    return ret
示例#12
0
 def test_get_top_features(self):
     # Fits a random forest and checks that dsp.get_top_features returns
     # the ten highest feature importances, both with generated column
     # names and with explicitly supplied ones.
     M, labels = uft.generate_test_matrix(1000, 15, random_state=0)
     M = utils.cast_np_sa_to_nd(M)
     M_train, M_test, labels_train, labels_test = train_test_split(
             M,
             labels)
     clf = RandomForestClassifier(random_state=0)
     clf.fit(M_train, labels_train)

     # Build the expected ranking from the fitted model instead of
     # hard-coding importances: literal scores depend on the scikit-learn
     # version and on the train/test split, which is requested here
     # without a random_state.
     ctrl_feat_importances = clf.feature_importances_
     ctrl_col_names = ['f{}'.format(i) for i in range(15)]
     # argsort is ascending; reverse and keep the ten best
     ctrl_feat_ranks = ctrl_feat_importances.argsort()[::-1][:10]
     ctrl = utils.convert_to_sa(
             list(zip(ctrl_col_names, ctrl_feat_importances)),
             col_names=('feat_name', 'score'))[ctrl_feat_ranks]

     res = dsp.get_top_features(clf, M, verbose=False)
     self.assertTrue(uft.array_equal(ctrl, res))
     res = dsp.get_top_features(
             clf,
             col_names=['f{}'.format(i) for i in range(15)],
             verbose=False)
     self.assertTrue(uft.array_equal(ctrl, res))