Пример #1
0
 def test_plot_correlation_scatter_plot(self):
     """Smoke-test ``comm.plot_correlation_scatter_plot`` on a small table.

     The table has three columns: ``base`` (0..9), ``linear_trans`` (an
     exact linear transform of ``base``) and ``no_correlation`` (a
     hand-written list of values).  The figure returned by the plotting
     call is passed to ``self.add_fig_to_report``; nothing is asserted
     about its contents.
     """
     col1 = range(10)
     col2 = [cell * 3 + 1 for cell in col1]
     col3 = [1, 5, 8, 4, 1, 8, 5, 9, 0, 1]
     # Materialize the rows: ``zip`` is a lazy iterator on Python 3 but was
     # a list on Python 2, so this hands ``convert_to_sa`` a concrete list
     # of row tuples under both interpreters.
     rows = list(zip(col1, col2, col3))
     sa = utils.convert_to_sa(
         rows,
         col_names=['base', 'linear_trans', 'no_correlation'])
     fig = comm.plot_correlation_scatter_plot(sa, verbose=False)
     self.add_fig_to_report(fig, 'plot_correlation_scatter_plot')
Пример #2
0
 def test_plot_correlation_scatter_plot(self):
     """Smoke-test ``comm.plot_correlation_scatter_plot`` on a small table.

     Builds a three-column structured array (``base``, ``linear_trans``,
     ``no_correlation``), plots it with ``verbose=False`` and passes the
     resulting figure to ``self.add_fig_to_report``.  No assertion is made
     on the figure itself.
     """
     col1 = range(10)
     col2 = [cell * 3 + 1 for cell in col1]
     col3 = [1, 5, 8, 4, 1, 8, 5, 9, 0, 1]
     # ``zip`` returns an iterator on Python 3; build the list of row
     # tuples explicitly so the input is the same on Python 2 and 3.
     rows = list(zip(col1, col2, col3))
     sa = utils.convert_to_sa(
         rows,
         col_names=['base', 'linear_trans', 'no_correlation'])
     fig = comm.plot_correlation_scatter_plot(sa, verbose=False)
     self.add_fig_to_report(fig, 'plot_correlation_scatter_plot')
Пример #3
0
    def test_convert_to_sa(self):
        """Check ``utils.convert_to_sa`` against hand-built control arrays.

        Five input shapes are exercised, in order:

        1. an array that is already structured (expected back unchanged),
        2. a homogeneous 2-D array without column names (expected field
           names ``f0``, ``f1``, ``f2``),
        3. a homogeneous 2-D array with explicit column names,
        4. a list of lists with mixed types and a ``None`` entry, without
           column names,
        5. a list of lists mixing strings, floats, datetimes, date strings
           and ``None``, with explicit column names.
        """
        # already a structured array
        # Plain ``1`` rather than ``01``: leading-zero integer literals are
        # a SyntaxError on Python 3 and have the same value on Python 2.
        sa = np.array([(1, 1.0, 'a', datetime(2015, 1, 1)),
                       (2, 2.0, 'b', datetime(2016, 1, 1))],
                      dtype=[('int', int), ('float', float), ('str', 'S1'),
                             ('date', 'M8[s]')])
        self.assertTrue(np.array_equal(sa, utils.convert_to_sa(sa)))

        # homogeneous array no col names provided
        nd = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
        ctrl = np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)],
                        dtype=[('f0', int), ('f1', int), ('f2', int)])
        self.assertTrue(np.array_equal(ctrl, utils.convert_to_sa(nd)))

        # homogeneous array with col names provided
        nd = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
        ctrl = np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)],
                        dtype=[('i0', int), ('i1', int), ('i2', int)])
        self.assertTrue(np.array_equal(ctrl, utils.convert_to_sa(
            nd,
            col_names=['i0', 'i1', 'i2'])))

        # list of lists no col name provided
        # Expected: the mixed int/str column becomes a string column and the
        # ``None`` in the float column becomes NaN.
        lol = [[1, 1, None],
               ['abc', 2, 3.4]]
        ctrl = np.array([('1', 1, np.nan),
                         ('abc', 2, 3.4)],
                        dtype=[('f0', 'S3'), ('f1', int), ('f2', float)])
        res = utils.convert_to_sa(lol)
        self.assertTrue(utils_for_tests.array_equal(ctrl, res))

        # list of lists with col name provided
        # Expected: ``None`` and the unparseable 'NO_SUCH_RECORD' string in
        # the date columns both map to ``utils.NOT_A_TIME``.
        lol = [['hello', 1.2, datetime(2012, 1, 1), None],
               [1.3, np.nan, None, '2013-01-01'],
               [1.4, 1.5, '2014-01-01', 'NO_SUCH_RECORD']]
        ctrl = np.array([('hello', 1.2, datetime(2012, 1, 1), utils.NOT_A_TIME),
                         ('1.3', np.nan, utils.NOT_A_TIME, datetime(2013, 1, 1)),
                         ('1.4', 1.5, datetime(2014, 1, 1), utils.NOT_A_TIME)],
                        dtype=[('i0', 'S5'), ('i1', float), ('i2', 'M8[us]'),
                               ('i3', 'M8[us]')])
        res = utils.convert_to_sa(lol, col_names=['i0', 'i1', 'i2', 'i3'])
        self.assertTrue(utils_for_tests.array_equal(ctrl, res))
Пример #4
0
    def test_convert_to_sa(self):
        """Check ``utils.convert_to_sa`` against hand-built control arrays.

        Covers, in order: an input that is already a structured array, a
        homogeneous 2-D array with and without explicit column names, and
        lists of lists with mixed types (including ``None`` and date
        strings) with and without explicit column names.
        """
        # already a structured array
        # Month/day are written ``1`` (not ``01``): leading-zero integer
        # literals do not parse on Python 3 and mean the same on Python 2.
        sa = np.array([(1, 1.0, 'a', datetime(2015, 1, 1)),
                       (2, 2.0, 'b', datetime(2016, 1, 1))],
                      dtype=[('int', int), ('float', float), ('str', 'S1'),
                             ('date', 'M8[s]')])
        self.assertTrue(np.array_equal(sa, utils.convert_to_sa(sa)))

        # homogeneous array no col names provided
        nd = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
        ctrl = np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)],
                        dtype=[('f0', int), ('f1', int), ('f2', int)])
        self.assertTrue(np.array_equal(ctrl, utils.convert_to_sa(nd)))

        # homogeneous array with col names provided
        nd = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
        ctrl = np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)],
                        dtype=[('i0', int), ('i1', int), ('i2', int)])
        self.assertTrue(
            np.array_equal(
                ctrl, utils.convert_to_sa(nd, col_names=['i0', 'i1', 'i2'])))

        # list of lists no col name provided
        # Expected: mixed int/str column -> string column; None -> NaN.
        lol = [[1, 1, None], ['abc', 2, 3.4]]
        ctrl = np.array([('1', 1, np.nan), ('abc', 2, 3.4)],
                        dtype=[('f0', 'S3'), ('f1', int), ('f2', float)])
        res = utils.convert_to_sa(lol)
        self.assertTrue(utils_for_tests.array_equal(ctrl, res))

        # list of lists with col name provided
        # Expected: None and non-date strings in date columns map to
        # ``utils.NOT_A_TIME``; ISO date strings are parsed to datetimes.
        lol = [['hello', 1.2, datetime(2012, 1, 1), None],
               [1.3, np.nan, None, '2013-01-01'],
               [1.4, 1.5, '2014-01-01', 'NO_SUCH_RECORD']]
        ctrl = np.array(
            [('hello', 1.2, datetime(2012, 1, 1), utils.NOT_A_TIME),
             ('1.3', np.nan, utils.NOT_A_TIME, datetime(2013, 1, 1)),
             ('1.4', 1.5, datetime(2014, 1, 1), utils.NOT_A_TIME)],
            dtype=[('i0', 'S5'), ('i1', float), ('i2', 'M8[us]'),
                   ('i3', 'M8[us]')])
        res = utils.convert_to_sa(lol, col_names=['i0', 'i1', 'i2', 'i3'])
        self.assertTrue(utils_for_tests.array_equal(ctrl, res))
Пример #5
0
def label_encode(M):
    """
    Changes string cols to integers so that there is a 1-1 mapping between
    strings and ints

    Parameters
    ----------
    M : array-like
        Anything accepted by ``convert_to_sa``; it is converted to a
        structured array before encoding.

    Returns
    -------
    numpy.ndarray
        A new structured array with the same column names and order.
        String columns are replaced by the integer labels produced by
        ``LabelEncoder.fit_transform``; every other column keeps its
        values and dtype.
    """

    M = convert_to_sa(M)
    le = preprocessing.LabelEncoder()
    new_dtype = []
    result_arrays = []
    # Walk the field names and inspect each field's dtype kind rather than
    # substring-matching ``dtype.descr`` format strings: ``descr`` may carry
    # 3-tuples / padding entries, and a kind check also recognises unicode
    # ('U') string columns, not only byte strings ('S').
    for col_name in M.dtype.names:
        col_dtype = M.dtype[col_name]
        if col_dtype.kind in ('S', 'U'):
            # fit_transform refits the encoder, so each column gets its
            # own independent string -> int mapping.
            result_arrays.append(le.fit_transform(M[col_name]))
            new_dtype.append((col_name, int))
        else:
            result_arrays.append(M[col_name])
            new_dtype.append((col_name, col_dtype))
    # zip() is a lazy iterator on Python 3; np.array needs a concrete
    # sequence of row tuples to build a structured array.
    return np.array(list(zip(*result_arrays)), dtype=new_dtype)
Пример #6
0
def label_encode(M):
    """
    Changes string cols to integers so that there is a 1-1 mapping between
    strings and ints

    Parameters
    ----------
    M : array-like
        Input table; passed through ``convert_to_sa`` first.

    Returns
    -------
    numpy.ndarray
        Structured array with the same field names as the converted input.
        String fields hold the integer labels returned by
        ``LabelEncoder.fit_transform``; other fields are copied as-is.
    """

    M = convert_to_sa(M)
    le = preprocessing.LabelEncoder()
    new_dtype = []
    result_arrays = []
    # Use the per-field dtype kind ('S' bytes, 'U' unicode) instead of a
    # substring test on ``dtype.descr`` strings, which misses unicode
    # columns and cannot be unpacked when descr holds 3-tuples or padding.
    for col_name in M.dtype.names:
        col_dtype = M.dtype[col_name]
        if col_dtype.kind in ('S', 'U'):
            # The encoder is refit for every column by fit_transform.
            result_arrays.append(le.fit_transform(M[col_name]))
            new_dtype.append((col_name, int))
        else:
            result_arrays.append(M[col_name])
            new_dtype.append((col_name, col_dtype))
    # Materialize the rows: on Python 3 zip() is an iterator, which
    # np.array cannot turn into a structured array.
    return np.array(list(zip(*result_arrays)), dtype=new_dtype)
Пример #7
0
 def test_get_top_features(self):
     """Compare ``comm.get_top_features`` output with fixed expected scores.

     A random forest (``random_state=0``) is fit on the training part of a
     generated 1000 x 15 matrix, and the ten (feature name, score) rows
     returned for the full matrix are compared to hard-coded values.
     """
     features, targets = uft.generate_test_matrix(1000, 15, random_state=0)
     features = utils.cast_np_sa_to_nd(features)
     # NOTE(review): the split itself is not seeded here; the hard-coded
     # scores below presumably rely on seeding done elsewhere -- confirm.
     train_X, _test_X, train_y, _test_y = train_test_split(features, targets)
     forest = RandomForestClassifier(random_state=0)
     forest.fit(train_X, train_y)
     actual = comm.get_top_features(forest, features, verbose=False)
     expected_rows = [
         ('f5', 0.0773838526068),
         ('f13', 0.0769596713039),
         ('f8', 0.0751584839431),
         ('f6', 0.0730815879102),
         ('f11', 0.0684456133071),
         ('f9', 0.0666747414603),
         ('f10', 0.0659621889608),
         ('f7', 0.0657988099065),
         ('f2', 0.0634000069218),
         ('f0', 0.0632912268319),
     ]
     expected = utils.convert_to_sa(expected_rows,
                                    col_names=('feat_name', 'score'))
     self.assertTrue(uft.array_equal(expected, actual))
Пример #8
0
 def test_get_top_features(self):
     """Check the ranked feature scores reported for a seeded random forest.

     Fits ``RandomForestClassifier(random_state=0)`` on a train split of a
     generated test matrix, then asserts that ``comm.get_top_features``
     (run on the whole matrix) equals a fixed table of ten
     ``(feat_name, score)`` rows.
     """
     data, target = uft.generate_test_matrix(1000, 15, random_state=0)
     data = utils.cast_np_sa_to_nd(data)
     # NOTE(review): train_test_split is called without a seed; the fixed
     # expected scores presumably depend on external seeding -- confirm.
     data_train, _data_test, target_train, _target_test = train_test_split(
         data, target)
     model = RandomForestClassifier(random_state=0)
     model.fit(data_train, target_train)
     observed = comm.get_top_features(model, data, verbose=False)
     top_ten = [
         ('f5', 0.0773838526068),
         ('f13', 0.0769596713039),
         ('f8', 0.0751584839431),
         ('f6', 0.0730815879102),
         ('f11', 0.0684456133071),
         ('f9', 0.0666747414603),
         ('f10', 0.0659621889608),
         ('f7', 0.0657988099065),
         ('f2', 0.0634000069218),
         ('f0', 0.0632912268319),
     ]
     reference = utils.convert_to_sa(top_ten,
                                     col_names=('feat_name', 'score'))
     self.assertTrue(uft.array_equal(reference, observed))
Пример #9
0
def replace_missing_vals(M, strategy, missing_val=np.nan, constant=0):
    """Return a copy of ``M`` with missing values in numeric columns replaced.

    Parameters
    ----------
    M : array-like
        Input table; converted with ``convert_to_sa`` and then copied, so
        the replacement is done on the copy.
    strategy : {'mean', 'median', 'most_frequent', 'constant'}
        ``'constant'`` overwrites missing entries with ``constant``; the
        other values are forwarded to the scikit-learn ``Imputer``.
    missing_val : scalar, optional
        Value that marks an entry as missing (default NaN).
    constant : scalar, optional
        Replacement value, used only when ``strategy == 'constant'``.

    Returns
    -------
    numpy.ndarray
        The modified copy of the structured array.

    Raises
    ------
    ValueError
        If ``strategy`` is not one of the four supported names.
    """
    # TODO support times, strings
    M = convert_to_sa(M)

    if strategy not in ['mean', 'median', 'most_frequent', 'constant']:
        raise ValueError('Invalid strategy')

    M_cp = M.copy()

    # Classify columns by dtype kind.  Substring tests on ``dtype.descr``
    # format strings are fragile: e.g. 'f' also occurs in '<M8[fs]', and
    # descr entries can be 3-tuples or padding that do not unpack/index.
    float_cols = [name for name in M_cp.dtype.names
                  if M_cp.dtype[name].kind == 'f']
    numeric_cols = [name for name in M_cp.dtype.names
                    if M_cp.dtype[name].kind in ('i', 'f')]

    if strategy == 'constant':

        try:
            missing_is_nan = np.isnan(missing_val)
        except TypeError:
            # missing_val is not a float
            missing_is_nan = False

        if missing_is_nan:  # we need to be careful about handling nan
            # NaN never compares equal to itself, so use isnan; only float
            # columns are visited on this path.
            for col_name in float_cols:
                col = M_cp[col_name]
                col[np.isnan(col)] = constant
            return M_cp

        for col_name in numeric_cols:
            col = M_cp[col_name]
            col[col == missing_val] = constant
        return M_cp

    # we're doing one of the sklearn imputer strategies
    # NOTE(review): ``Imputer`` was removed from newer scikit-learn releases
    # (replaced by ``sklearn.impute.SimpleImputer``) -- confirm the version
    # this file is pinned to.
    imp = Imputer(missing_values=missing_val, strategy=strategy, axis=1)
    for col_name in numeric_cols:
        # The Imputer only works on float and int columns
        col = M_cp[col_name]
        # Write through the field view so M_cp itself is updated.
        col[:] = imp.fit_transform(col)
    return M_cp
Пример #10
0
def replace_missing_vals(M, strategy, missing_val=np.nan, constant=0):
    """Replace missing values in the int/float columns of a table.

    Parameters
    ----------
    M : array-like
        Input table, converted via ``convert_to_sa``.  A copy of the
        converted array is modified and returned.
    strategy : {'mean', 'median', 'most_frequent', 'constant'}
        How to fill missing entries.  ``'constant'`` uses ``constant``;
        the rest are passed to the scikit-learn ``Imputer``.
    missing_val : scalar, optional
        Marker for a missing entry (default NaN).
    constant : scalar, optional
        Fill value for the ``'constant'`` strategy.

    Returns
    -------
    numpy.ndarray
        Structured array copy with missing entries replaced.

    Raises
    ------
    ValueError
        If ``strategy`` is not a supported name.
    """
    # TODO support times, strings
    M = convert_to_sa(M)

    if strategy not in ['mean', 'median', 'most_frequent', 'constant']:
        raise ValueError('Invalid strategy')

    M_cp = M.copy()

    # Pick columns by dtype kind instead of substring-matching the
    # ``dtype.descr`` strings, which misfires on units such as '[fs]' and
    # fails to unpack when descr contains 3-tuples or padding entries.
    float_cols = [name for name in M_cp.dtype.names
                  if M_cp.dtype[name].kind == 'f']
    numeric_cols = [name for name in M_cp.dtype.names
                    if M_cp.dtype[name].kind in ('i', 'f')]

    if strategy == 'constant':

        try:
            missing_is_nan = np.isnan(missing_val)
        except TypeError:
            # missing_val is not a float
            missing_is_nan = False

        if missing_is_nan:  # we need to be careful about handling nan
            # ``col == nan`` is always False, hence the isnan mask; this
            # path only touches float columns.
            for col_name in float_cols:
                col = M_cp[col_name]
                col[np.isnan(col)] = constant
            return M_cp

        for col_name in numeric_cols:
            col = M_cp[col_name]
            col[col == missing_val] = constant
        return M_cp

    # we're doing one of the sklearn imputer strategies
    # NOTE(review): ``Imputer`` no longer exists in recent scikit-learn
    # (``sklearn.impute.SimpleImputer`` replaced it) -- confirm the
    # supported version before upgrading.
    imp = Imputer(missing_values=missing_val, strategy=strategy, axis=1)
    for col_name in numeric_cols:
        # The Imputer only works on float and int columns
        col = M_cp[col_name]
        # In-place slice assignment updates the field inside M_cp.
        col[:] = imp.fit_transform(col)
    return M_cp