예제 #1
0
    def test_replace_missing_vals(self):
        """Exercise replace_missing_vals with each replacement strategy.

        Covers 'constant' (with and without an explicit missing_val),
        'most_frequent' and 'mean' on a small structured array that has
        object, int and float columns.
        """
        rows = [('a', 0, 0.0, 0.1),
                ('b', 1, 1.0, np.nan),
                ('', -999, np.nan, 0.0),
                ('d', 1, np.nan, 0.2),
                ('', -999, 2.0, np.nan)]
        col_types = [('str', 'O'), ('int', int), ('float1', float),
                     ('float2', float)]
        M = np.array(rows, dtype=col_types)

        # 'constant' without missing_val: the expectation is that only the
        # NaN cells of the float columns become the constant.
        expected = M.copy()
        expected['float1'] = np.array([0.0, 1.0, -1.0, -1.0, 2.0])
        expected['float2'] = np.array([0.1, -1.0, 0.0, 0.2, -1.0])
        actual = replace_missing_vals(M, 'constant', constant=-1.0)
        self.assertTrue(np.array_equal(expected, actual))

        # 'constant' with missing_val=0: zeros are expected to be replaced
        # in the int and float columns while NaN and -999 stay untouched.
        expected = M.copy()
        expected['int'] = np.array([100, 1, -999, 1, -999])
        expected['float1'] = np.array([100, 1.0, np.nan, np.nan, 2.0])
        expected['float2'] = np.array([0.1, np.nan, 100, 0.2, np.nan])
        actual = replace_missing_vals(
            M, 'constant', missing_val=0, constant=100)
        self.assertTrue(utils_for_tests.array_equal(expected, actual))

        # 'most_frequent' with missing_val=-999: 1 is the most common
        # remaining value of the int column.
        expected = M.copy()
        expected['int'] = np.array([0, 1, 1, 1, 1])
        actual = replace_missing_vals(M, 'most_frequent', missing_val=-999)
        self.assertTrue(utils_for_tests.array_equal(expected, actual))

        # 'mean' with missing_val=NaN: the non-NaN means are 1.0 (float1)
        # and 0.1 (float2).
        expected = M.copy()
        expected['float1'] = np.array([0.0, 1.0, 1.0, 1.0, 2.0])
        expected['float2'] = np.array([0.1, 0.1, 0.0, 0.2, 0.1])
        actual = replace_missing_vals(M, 'mean', missing_val=np.nan)
        self.assertTrue(utils_for_tests.array_equal(expected, actual))
예제 #2
0
    def test_replace_missing_vals(self):
        """Exercise replace_missing_vals with each replacement strategy.

        Covers 'constant' (with and without an explicit missing_val),
        'most_frequent' and 'mean' on a small structured array that has
        object, int and float columns.
        """
        rows = [('a', 0, 0.0, 0.1),
                ('b', 1, 1.0, np.nan),
                ('', -999, np.nan, 0.0),
                ('d', 1, np.nan, 0.2),
                ('', -999, 2.0, np.nan)]
        col_types = [('str', 'O'), ('int', int), ('float1', float),
                     ('float2', float)]
        M = np.array(rows, dtype=col_types)

        # 'constant' without missing_val: the expectation is that only the
        # NaN cells of the float columns become the constant.
        expected = M.copy()
        expected['float1'] = np.array([0.0, 1.0, -1.0, -1.0, 2.0])
        expected['float2'] = np.array([0.1, -1.0, 0.0, 0.2, -1.0])
        actual = replace_missing_vals(M, 'constant', constant=-1.0)
        self.assertTrue(np.array_equal(expected, actual))

        # 'constant' with missing_val=0: zeros are expected to be replaced
        # in the int and float columns while NaN and -999 stay untouched.
        expected = M.copy()
        expected['int'] = np.array([100, 1, -999, 1, -999])
        expected['float1'] = np.array([100, 1.0, np.nan, np.nan, 2.0])
        expected['float2'] = np.array([0.1, np.nan, 100, 0.2, np.nan])
        actual = replace_missing_vals(
            M, 'constant', missing_val=0, constant=100)
        self.assertTrue(utils_for_tests.array_equal(expected, actual))

        # 'most_frequent' with missing_val=-999: 1 is the most common
        # remaining value of the int column.
        expected = M.copy()
        expected['int'] = np.array([0, 1, 1, 1, 1])
        actual = replace_missing_vals(M, 'most_frequent', missing_val=-999)
        self.assertTrue(utils_for_tests.array_equal(expected, actual))

        # 'mean' with missing_val=NaN: the non-NaN means are 1.0 (float1)
        # and 0.1 (float2).
        expected = M.copy()
        expected['float1'] = np.array([0.0, 1.0, 1.0, 1.0, 2.0])
        expected['float2'] = np.array([0.1, 0.1, 0.0, 0.2, 0.1])
        actual = replace_missing_vals(M, 'mean', missing_val=np.nan)
        self.assertTrue(utils_for_tests.array_equal(expected, actual))
예제 #3
0
    def test_describe_cols(self):
        test_list = [[1, 2],[2, 3],[3, 4],[4, 5],[5, 6],[6, 7]]
        test_nd = np.array(test_list)
        test_sa = np.array([(1, 2, 'a'), (2, 3, 'b'), (3, 4, 'c'), (4, 5, 'd'), 
                            (5, 6, 'e'), (6, 7, 'f')], 
                           dtype=[('id', int), ('val', float), ('name', 'S1')])
        ctrl_list = np.array([('f0', 6, 3.5, 1.707825127659933, 1, 6),
                              ('f1', 6, 4.5, 1.707825127659933, 2, 7)],
                             dtype=[('Column Name', 'S2'), ('Count', int),
                                    ('Mean', float), ('Standard Dev', float),
                                    ('Minimum', int), ('Maximum', int)])
        ctrl_printout = """
  Column Name Count Mean  Standard Dev Minimum Maximum
0          f0     6  3.5 1.70782512766       1       6
1          f1     6  4.5 1.70782512766       2       7
        """.strip()
        with uft.rerout_stdout() as get_stdout:
            self.assertTrue(uft.array_equal(ctrl_list, 
                                            describe_cols(
                                                test_list)))
            self.assertEqual(get_stdout().strip(), ctrl_printout)
        self.assertTrue(uft.array_equal(ctrl_list, 
                                        describe_cols(
                                            test_nd, verbose=False)))
        ctrl_sa = np.array([('id', 6, 3.5, 1.707825127659933, 1, 6),
                            ('val', 6, 4.5, 1.707825127659933, 2, 7),
                            ('name', np.nan, np.nan, np.nan, np.nan, np.nan)],
                           dtype=[('Column Name', 'S4'), ('Count', float),
                                  ('Mean', float), ('Standard Dev', float),
                                  ('Minimum', float), ('Maximum', float)])
        self.assertTrue(uft.array_equal(ctrl_sa, 
                                        describe_cols(
                                            test_sa,
                                            verbose=False)))
예제 #4
0
 def test_array_emitter(self):
     """Emit SUM-aggregated arrays from SQL for three date intervals.

     Reads the 'rg_complex_dates' table through self.conn_str and compares
     the array emitted for each interval with hard-coded values, matching
     rows on the 'id' column.
     """
     # NOTE(review): db_file is not referenced again in this test; the
     # connection comes from self.conn_str.
     db_file = uft.path_of_data('rg_complex_dates.db')
     emitter = array_emitter.ArrayEmitter(convert_to_unix_time=True)
     for feature in ('bounded', 'no_start', 'no_stop', 'unbounded'):
         emitter = emitter.set_aggregation(feature, 'SUM')
     emitter = emitter.get_rg_from_sql(self.conn_str,
                                       'rg_complex_dates',
                                       feature_col='feature')

     # First half of 2010, second half of 2010, then the whole year.
     intervals = ((datetime(2010, 1, 1), datetime(2010, 6, 30)),
                  (datetime(2010, 7, 1), datetime(2010, 12, 31)),
                  (datetime(2010, 1, 1), datetime(2010, 12, 31)))
     emitted = [emitter.set_interval(start, stop).emit_M()
                for start, stop in intervals]

     ctrl_dtype = [('id', '<i8'), ('bounded_sum', '<f8'),
                   ('no_start_sum', '<f8'), ('no_stop_sum', '<f8'),
                   ('unbounded_sum', '<f8')]
     expected_rows = (
         [(0, 1.0, 100.0, 100000.0, 1000000.0),
          (1, 0.01, 0.001, 1e-06, 1e-07),
          (2, np.nan, np.nan, np.nan, 2e-08)],
         [(0, 10.0, 1000.0, 10000.0, 1000000.0),
          (1, 0.1, 0.0001, 1e-05, 1e-07),
          (2, np.nan, np.nan, np.nan, 2e-08)],
         [(0, 11.0, 1100.0, 110000.0, 1000000.0),
          (1, 0.11, 0.0011, 1.1e-05, 1e-07),
          (2, np.nan, np.nan, np.nan, 2e-08)])
     for res, rows in zip(emitted, expected_rows):
         ctrl = np.array(rows, dtype=ctrl_dtype)
         self.assertTrue(uft.array_equal(res, ctrl, idx_col='id'))
예제 #5
0
 def test_multiple_aggr(self):
     """Emit an array with several aggregations per feature.

     Uses a default list of aggregations, overrides it for 'absences' and
     'graduated', marks 'graduated' as the label feature with its own
     interval, and compares the emitted array with hard-coded values.
     """
     conn_str = 'sqlite:///{}'.format(uft.path_of_data('rg_students.db'))

     emitter = array_emitter.ArrayEmitter()
     emitter = emitter.get_rg_from_sql(conn_str, 'rg_students')
     emitter = emitter.set_default_aggregation(
         ['AVG', 'MIN', 'MAX', 'COUNT'])
     emitter = emitter.set_aggregation('absences', ['MIN', 'MAX'])
     emitter = emitter.set_aggregation('graduated', 'MAX')
     emitter = emitter.set_interval(2005, 2007)
     emitter = emitter.set_label_feature('graduated')
     emitter = emitter.set_label_interval(2009, 2009)
     res = emitter.emit_M()

     ctrl_dtype = [('id', '<i8'),
                   ('math_gpa_AVG', '<f8'),
                   ('math_gpa_MIN', '<f8'),
                   ('math_gpa_MAX', '<f8'),
                   ('math_gpa_COUNT', '<i8'),
                   ('english_gpa_AVG', '<f8'),
                   ('english_gpa_MIN', '<f8'),
                   ('english_gpa_MAX', '<f8'),
                   ('english_gpa_COUNT', '<f8'),
                   ('absences_MIN', '<f8'),
                   ('absences_MAX', '<f8'),
                   ('graduated_MAX', '<f8')]
     ctrl_rows = [
         (0, 2.2, 2.1, 2.3, 2, 3.95, 3.9, 4.0, 2, 7.0, 8.0, 1.0),
         (1, 3.45, 3.4, 3.5, 2, np.nan, np.nan, np.nan, np.nan, 0.0, 0.0,
          0.0),
         (2, 3.4, 3.4, 3.4, 1.0, np.nan, np.nan, np.nan, np.nan, 14.0,
          96.0, np.nan)]
     ctrl = np.array(ctrl_rows, dtype=ctrl_dtype)
     self.assertTrue(uft.array_equal(res, ctrl))
예제 #6
0
 def test_cast_list_of_list_to_sa(self):
     """Check cast_list_of_list_to_sa on text-only and mixed-type input.

     The control arrays encode the placeholders the test expects for None
     cells: '' for string columns, -999 for integer columns and NaN for
     float columns.
     """
     # Case 1: default column names, every column ends up as a byte string.
     text_rows = [[None, None, None],
                  ['a', 5, None],
                  ['ab', 'x', None]]
     text_dtype = [('f0', 'S2'), ('f1', 'S1'), ('f2', 'S1')]
     expected = np.array([('', '', ''),
                          ('a', '5', ''),
                          ('ab', 'x', '')],
                         dtype=text_dtype)
     converted = utils.cast_list_of_list_to_sa(text_rows)
     self.assertTrue(np.array_equal(converted, expected))

     # Case 2: explicit column names with int, unicode, float and long
     # columns. long(...) is the Python 2 long type, same as an L suffix.
     mixed_rows = [[None, u'\u05dd\u05d5\u05dc\u05e9', 4.0, 7],
                   [2, 'hello', np.nan, None],
                   [4, None, None, long(14)]]
     mixed_dtype = [('int', int), ('ucode', 'U5'), ('float', float),
                    ('long', long)]
     expected = np.array(
         [(-999, u'\u05dd\u05d5\u05dc\u05e9', 4.0, 7),
          (2, u'hello', np.nan, long(-999)),
          (4, u'', np.nan, long(14))],
         dtype=mixed_dtype)
     converted = utils.cast_list_of_list_to_sa(
         mixed_rows,
         col_names=['int', 'ucode', 'float', 'long'])
     self.assertTrue(utils_for_tests.array_equal(expected, converted))
예제 #7
0
 def test_from_csv(self):
     """Emit SUM-aggregated arrays from a CSV file for three date intervals.

     Loads 'rg_complex_dates.csv' with the 'start' and 'stop' columns
     parsed as datetimes and compares the array emitted for each interval
     with hard-coded values.
     """
     csv_path = uft.path_of_data('rg_complex_dates.csv')
     emitter = array_emitter.ArrayEmitter()
     for feature in ('bounded', 'no_start', 'no_stop', 'unbounded'):
         emitter = emitter.set_aggregation(feature, 'SUM')
     emitter = emitter.get_rg_from_csv(csv_path,
                                       feature_col='feature',
                                       parse_datetimes=['start', 'stop'])

     # First half of 2010, second half of 2010, then the whole year.
     intervals = ((datetime(2010, 1, 1), datetime(2010, 6, 30)),
                  (datetime(2010, 7, 1), datetime(2010, 12, 31)),
                  (datetime(2010, 1, 1), datetime(2010, 12, 31)))
     emitted = [emitter.set_interval(start, stop).emit_M()
                for start, stop in intervals]

     ctrl_dtype = [('id', '<i8'), ('bounded_SUM', '<f8'),
                   ('no_start_SUM', '<f8'), ('no_stop_SUM', '<f8'),
                   ('unbounded_SUM', '<f8')]
     expected_rows = (
         [(0, 1.0, 100.0, 100000.0, 1000000.0),
          (1, 0.01, 0.001, 1e-06, 1e-07),
          (2, np.nan, np.nan, np.nan, 2e-08)],
         [(0, 10.0, 1000.0, 10000.0, 1000000.0),
          (1, 0.1, 0.0001, 1e-05, 1e-07),
          (2, np.nan, np.nan, np.nan, 2e-08)],
         [(0, 11.0, 1100.0, 110000.0, 1000000.0),
          (1, 0.11, 0.0011, 1.1e-05, 1e-07),
          (2, np.nan, np.nan, np.nan, 2e-08)])
     for res, rows in zip(emitted, expected_rows):
         ctrl = np.array(rows, dtype=ctrl_dtype)
         self.assertTrue(uft.array_equal(res, ctrl))
예제 #8
0
 def test_from_csv(self):
     """Emit SUM-aggregated arrays from a CSV file for three date intervals.

     Loads 'rg_complex_dates.csv' with the 'start' and 'stop' columns
     parsed as datetimes. The expected columns carry the plain feature
     names (no aggregation suffix).
     """
     csv_path = uft.path_of_data('rg_complex_dates.csv')
     emitter = array_emitter.ArrayEmitter()
     for feature in ('bounded', 'no_start', 'no_stop', 'unbounded'):
         emitter = emitter.set_aggregation(feature, 'SUM')
     emitter = emitter.get_rg_from_csv(csv_path, feature_col='feature',
                                       parse_datetimes=['start', 'stop'])

     # First half of 2010, second half of 2010, then the whole year.
     first_half = emitter.set_interval(datetime(2010, 1, 1),
                                       datetime(2010, 6, 30)).emit_M()
     second_half = emitter.set_interval(datetime(2010, 7, 1),
                                        datetime(2010, 12, 31)).emit_M()
     full_year = emitter.set_interval(datetime(2010, 1, 1),
                                      datetime(2010, 12, 31)).emit_M()

     ctrl_dtype = [('id', '<i8'), ('bounded', '<f8'),
                   ('no_start', '<f8'), ('no_stop', '<f8'),
                   ('unbounded', '<f8')]
     cases = [
         (first_half, [(0, 1.0, 100.0, 100000.0, 1000000.0),
                       (1, 0.01, 0.001, 1e-06, 1e-07),
                       (2, np.nan, np.nan, np.nan, 2e-08)]),
         (second_half, [(0, 10.0, 1000.0, 10000.0, 1000000.0),
                        (1, 0.1, 0.0001, 1e-05, 1e-07),
                        (2, np.nan, np.nan, np.nan, 2e-08)]),
         (full_year, [(0, 11.0, 1100.0, 110000.0, 1000000.0),
                      (1, 0.11, 0.0011, 1.1e-05, 1e-07),
                      (2, np.nan, np.nan, np.nan, 2e-08)]),
     ]
     for res, rows in cases:
         ctrl = np.array(rows, dtype=ctrl_dtype)
         self.assertTrue(uft.array_equal(res, ctrl))
예제 #9
0
 def test_array_emitter(self):
     """Emit SUM-aggregated arrays from SQL for three date intervals.

     Reads the 'rg_complex_dates' table through self.conn_str and compares
     the array emitted for each interval with hard-coded values, matching
     rows on the 'id' column.
     """
     # NOTE(review): db_file is not referenced again in this test; the
     # connection comes from self.conn_str.
     db_file = uft.path_of_data('rg_complex_dates.db')
     emitter = array_emitter.ArrayEmitter(convert_to_unix_time=True)
     for feature in ('bounded', 'no_start', 'no_stop', 'unbounded'):
         emitter = emitter.set_aggregation(feature, 'SUM')
     emitter = emitter.get_rg_from_sql(self.conn_str, 'rg_complex_dates',
                                       feature_col='feature')

     # First half of 2010, second half of 2010, then the whole year.
     first_half = emitter.set_interval(datetime(2010, 1, 1),
                                       datetime(2010, 6, 30)).emit_M()
     second_half = emitter.set_interval(datetime(2010, 7, 1),
                                        datetime(2010, 12, 31)).emit_M()
     full_year = emitter.set_interval(datetime(2010, 1, 1),
                                      datetime(2010, 12, 31)).emit_M()

     ctrl_dtype = [('id', '<i8'), ('bounded_sum', '<f8'),
                   ('no_start_sum', '<f8'), ('no_stop_sum', '<f8'),
                   ('unbounded_sum', '<f8')]
     cases = [
         (first_half, [(0, 1.0, 100.0, 100000.0, 1000000.0),
                       (1, 0.01, 0.001, 1e-06, 1e-07),
                       (2, np.nan, np.nan, np.nan, 2e-08)]),
         (second_half, [(0, 10.0, 1000.0, 10000.0, 1000000.0),
                        (1, 0.1, 0.0001, 1e-05, 1e-07),
                        (2, np.nan, np.nan, np.nan, 2e-08)]),
         (full_year, [(0, 11.0, 1100.0, 110000.0, 1000000.0),
                      (1, 0.11, 0.0011, 1.1e-05, 1e-07),
                      (2, np.nan, np.nan, np.nan, 2e-08)]),
     ]
     for res, rows in cases:
         ctrl = np.array(rows, dtype=ctrl_dtype)
         self.assertTrue(uft.array_equal(res, ctrl, idx_col='id'))
예제 #10
0
 def test_cast_list_of_list_to_sa2(self):
     """Check cast_list_of_list_to_sa on text-only and mixed-type input.

     The control arrays encode the placeholders the test expects for None
     cells: '' for string columns, -999 for integer columns and NaN for
     float columns.
     """
     # Case 1: default column names, every column ends up as a byte string.
     text_rows = [[None, None, None],
                  ['a', 5, None],
                  ['ab', 'x', None]]
     text_dtype = [('f0', 'S2'), ('f1', 'S1'), ('f2', 'S1')]
     expected = np.array([('', '', ''),
                          ('a', '5', ''),
                          ('ab', 'x', '')],
                         dtype=text_dtype)
     converted = utils.cast_list_of_list_to_sa(text_rows)
     self.assertTrue(np.array_equal(converted, expected))

     # Case 2: explicit column names with int, unicode, float and long
     # columns. long(...) is the Python 2 long type, same as an L suffix.
     mixed_rows = [[None, u'\u05dd\u05d5\u05dc\u05e9', 4.0, 7],
                   [2, 'hello', np.nan, None],
                   [4, None, None, long(14)]]
     mixed_dtype = [('int', int), ('ucode', 'U5'), ('float', float),
                    ('long', long)]
     expected = np.array(
         [(-999, u'\u05dd\u05d5\u05dc\u05e9', 4.0, 7),
          (2, u'hello', np.nan, long(-999)),
          (4, u'', np.nan, long(14))],
         dtype=mixed_dtype)
     converted = utils.cast_list_of_list_to_sa(
         mixed_rows,
         col_names=['int', 'ucode', 'float', 'long'])
     self.assertTrue(uft.array_equal(expected, converted))
예제 #11
0
    def test_convert_to_sa(self):
        """Check utils.convert_to_sa on each supported kind of input.

        Covers an existing structured array, a homogeneous ndarray (with
        and without column names) and lists of lists (with and without
        column names), including None / unparsable cells.
        """
        # already a structured array
        # Month/day are plain decimals: a leading zero (01) is an octal
        # literal in Python 2 and a SyntaxError in Python 3.
        sa = np.array([(1, 1.0, 'a', datetime(2015, 1, 1)),
                       (2, 2.0, 'b', datetime(2016, 1, 1))],
                      dtype=[('int', int), ('float', float), ('str', 'S1'),
                             ('date', 'M8[s]')])
        self.assertTrue(np.array_equal(sa, utils.convert_to_sa(sa)))

        # homogeneous array no col names provided
        nd = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
        ctrl = np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)],
                        dtype=[('f0', int), ('f1', int), ('f2', int)])
        self.assertTrue(np.array_equal(ctrl, utils.convert_to_sa(nd)))

        # homogeneous array with col names provided
        nd = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
        ctrl = np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)],
                        dtype=[('i0', int), ('i1', int), ('i2', int)])
        self.assertTrue(np.array_equal(ctrl, utils.convert_to_sa(
            nd,
            col_names=['i0', 'i1', 'i2'])))

        # list of lists no col name provided
        lol = [[1, 1, None],
               ['abc', 2, 3.4]]
        ctrl = np.array([('1', 1, np.nan),
                         ('abc', 2, 3.4)],
                        dtype=[('f0', 'S3'), ('f1', int), ('f2', float)])
        res = utils.convert_to_sa(lol)
        self.assertTrue(utils_for_tests.array_equal(ctrl, res))

        # list of lists with col name provided; None and unparsable date
        # cells are expected to come back as utils.NOT_A_TIME
        lol = [['hello', 1.2, datetime(2012, 1, 1), None],
               [1.3, np.nan, None, '2013-01-01'],
               [1.4, 1.5, '2014-01-01', 'NO_SUCH_RECORD']]
        ctrl = np.array([('hello', 1.2, datetime(2012, 1, 1), utils.NOT_A_TIME),
                         ('1.3', np.nan, utils.NOT_A_TIME, datetime(2013, 1, 1)),
                         ('1.4', 1.5, datetime(2014, 1, 1), utils.NOT_A_TIME)],
                        dtype=[('i0', 'S5'), ('i1', float), ('i2', 'M8[us]'),
                               ('i3', 'M8[us]')])
        res = utils.convert_to_sa(lol, col_names=['i0', 'i1', 'i2', 'i3'])
        self.assertTrue(utils_for_tests.array_equal(ctrl, res))
예제 #12
0
    def test_convert_to_sa(self):
        """Check utils.convert_to_sa on each supported kind of input.

        Covers an existing structured array, a homogeneous ndarray (with
        and without column names) and lists of lists (with and without
        column names), including None / unparsable cells.
        """
        # already a structured array
        # Month/day are plain decimals: a leading zero (01) is an octal
        # literal in Python 2 and a SyntaxError in Python 3.
        sa = np.array([(1, 1.0, 'a', datetime(2015, 1, 1)),
                       (2, 2.0, 'b', datetime(2016, 1, 1))],
                      dtype=[('int', int), ('float', float), ('str', 'O'),
                             ('date', 'M8[s]')])
        self.assertTrue(np.array_equal(sa, utils.convert_to_sa(sa)))

        # homogeneous array no col names provided
        nd = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
        ctrl = np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)],
                        dtype=[('f0', int), ('f1', int), ('f2', int)])
        self.assertTrue(np.array_equal(ctrl, utils.convert_to_sa(nd)))

        # homogeneous array with col names provided
        nd = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
        ctrl = np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)],
                        dtype=[('i0', int), ('i1', int), ('i2', int)])
        self.assertTrue(np.array_equal(ctrl, utils.convert_to_sa(
            nd,
            col_names=['i0', 'i1', 'i2'])))

        # list of lists no col name provided
        lol = [[1, 1, None],
               ['abc', 2, 3.4]]
        ctrl = np.array([('1', 1, np.nan),
                         ('abc', 2, 3.4)],
                        dtype=[('f0', 'S3'), ('f1', int), ('f2', float)])
        res = utils.convert_to_sa(lol)
        self.assertTrue(uft.array_equal(ctrl, res))

        # list of lists with col name provided; None and unparsable date
        # cells are expected to come back as utils.NOT_A_TIME
        lol = [['hello', 1.2, datetime(2012, 1, 1), None],
               [1.3, np.nan, None, '2013-01-01'],
               [1.4, 1.5, '2014-01-01', 'NO_SUCH_RECORD']]
        ctrl = np.array([('hello', 1.2, datetime(2012, 1, 1), utils.NOT_A_TIME),
                         ('1.3', np.nan, utils.NOT_A_TIME, datetime(2013, 1, 1)),
                         ('1.4', 1.5, datetime(2014, 1, 1), utils.NOT_A_TIME)],
                        dtype=[('i0', 'S5'), ('i1', float), ('i2', 'M8[us]'),
                               ('i3', 'M8[us]')])
        res = utils.convert_to_sa(lol, col_names=['i0', 'i1', 'i2', 'i3'])
        self.assertTrue(uft.array_equal(ctrl, res))
예제 #13
0
    def test_get_top_features(self):
        """Check dsp.get_top_features against the classifier's importances.

        The control array is built from the fitted forest's
        feature_importances_, ordered from highest to lowest score and cut
        to the first ten entries.
        """
        M, labels = uft.generate_test_matrix(1000, 15, random_state=0)
        M = utils.cast_np_sa_to_nd(M)
        split = train_test_split(M, labels)
        M_train, M_test, labels_train, labels_test = split
        clf = RandomForestClassifier(random_state=0)
        clf.fit(M_train, labels_train)

        importances = clf.feature_importances_
        names = ['f{}'.format(i) for i in xrange(15)]
        # argsort is ascending, so reverse it before taking the top ten
        top_ten = np.argsort(importances)[::-1][:10]
        scored = utils.convert_to_sa(
            zip(names, importances),
            col_names=('feat_name', 'score'))
        ctrl = scored[top_ten]

        # column names taken from M
        res = dsp.get_top_features(clf, M, verbose=False)
        self.assertTrue(uft.array_equal(ctrl, res))

        # column names passed explicitly
        explicit_names = ['f{}'.format(i) for i in xrange(15)]
        res = dsp.get_top_features(clf, col_names=explicit_names,
                                   verbose=False)
        self.assertTrue(uft.array_equal(ctrl, res))
예제 #14
0
 def test_describe_cols(self):
     """Check describe_cols on a list of lists, an ndarray and a
     structured array, comparing each result with a hard-coded summary.
     """
     test_list = [[1, 2], [2, 3], [3, 4], [4, 5], [5, 6], [6, 7]]
     test_nd = np.array(test_list)
     test_sa = np.array(
         [(1, 2, 'a'), (2, 3, 'b'), (3, 4, 'c'),
          (4, 5, 'd'), (5, 6, 'e'), (6, 7, 'f')],
         dtype=[('id', int), ('val', float), ('name', 'S1')])

     # Both numeric columns share the same standard deviation.
     std = 1.707825127659933
     numeric_dtype = [('Column Name', 'S2'), ('Count', int),
                      ('Mean', float), ('Standard Dev', float),
                      ('Minimum', int), ('Maximum', int)]
     ctrl_list = np.array([('f0', 6, 3.5, std, 1, 6),
                           ('f1', 6, 4.5, std, 2, 7)],
                          dtype=numeric_dtype)
     from_list = describe_cols(test_list)
     self.assertTrue(utils_for_tests.array_equal(ctrl_list, from_list))
     from_nd = describe_cols(test_nd)
     self.assertTrue(utils_for_tests.array_equal(ctrl_list, from_nd))

     # The string column is expected to yield NaN for every statistic,
     # so all statistic columns are float in this control array.
     mixed_dtype = [('Column Name', 'S4'), ('Count', float),
                    ('Mean', float), ('Standard Dev', float),
                    ('Minimum', float), ('Maximum', float)]
     ctrl_sa = np.array(
         [('id', 6, 3.5, std, 1, 6),
          ('val', 6, 4.5, std, 2, 7),
          ('name', np.nan, np.nan, np.nan, np.nan, np.nan)],
         dtype=mixed_dtype)
     from_sa = describe_cols(test_sa)
     self.assertTrue(utils_for_tests.array_equal(ctrl_sa, from_sa))
예제 #15
0
 def test_describe_cols(self):
     """Check describe_cols on a list of lists, an ndarray and a
     structured array, comparing each result with a hard-coded summary.
     """
     test_list = [[1, 2], [2, 3], [3, 4], [4, 5], [5, 6], [6, 7]]
     test_nd = np.array(test_list)
     test_sa = np.array(
         [(1, 2, 'a'), (2, 3, 'b'), (3, 4, 'c'),
          (4, 5, 'd'), (5, 6, 'e'), (6, 7, 'f')],
         dtype=[('id', int), ('val', float), ('name', 'S1')])

     # Both numeric columns share the same standard deviation.
     std = 1.707825127659933
     numeric_dtype = [('Column Name', 'S2'), ('Count', int),
                      ('Mean', float), ('Standard Dev', float),
                      ('Minimum', int), ('Maximum', int)]
     ctrl_list = np.array([('f0', 6, 3.5, std, 1, 6),
                           ('f1', 6, 4.5, std, 2, 7)],
                          dtype=numeric_dtype)
     from_list = describe_cols(test_list)
     self.assertTrue(uft.array_equal(ctrl_list, from_list))
     from_nd = describe_cols(test_nd)
     self.assertTrue(uft.array_equal(ctrl_list, from_nd))

     # The string column is expected to yield NaN for every statistic,
     # so all statistic columns are float in this control array.
     mixed_dtype = [('Column Name', 'S4'), ('Count', float),
                    ('Mean', float), ('Standard Dev', float),
                    ('Minimum', float), ('Maximum', float)]
     ctrl_sa = np.array(
         [('id', 6, 3.5, std, 1, 6),
          ('val', 6, 4.5, std, 2, 7),
          ('name', np.nan, np.nan, np.nan, np.nan, np.nan)],
         dtype=mixed_dtype)
     from_sa = describe_cols(test_sa)
     self.assertTrue(uft.array_equal(ctrl_sa, from_sa))
예제 #16
0
 def test_basic(self):
     """Emit an array from the students database for 2005-2007.

     Only 'absences' gets an explicit aggregation (MAX); the emitted array
     is compared with hard-coded values.
     """
     conn_str = 'sqlite:///{}'.format(uft.path_of_data('rg_students.db'))

     emitter = array_emitter.ArrayEmitter()
     emitter = emitter.get_rg_from_sql(conn_str, 'rg_students')
     emitter = emitter.set_aggregation('absences', 'MAX')
     emitter = emitter.set_interval(2005, 2007)
     res = emitter.emit_M()

     ctrl_dtype = [('id', '<i8'), ('math_gpa', '<f8'),
                   ('english_gpa', '<f8'), ('absences', '<f8')]
     ctrl_rows = [(0, 2.2, 3.95, 8.0),
                  (1, 3.45, np.nan, 0.0),
                  (2, 3.4, np.nan, 96.0)]
     ctrl = np.array(ctrl_rows, dtype=ctrl_dtype)
     self.assertTrue(uft.array_equal(res, ctrl))
예제 #17
0
 def test_get_top_features(self):
     """Check dsp.get_top_features against the classifier's importances.

     The control array is derived from the fitted forest's
     feature_importances_ rather than from hard-coded scores.
     train_test_split is called without a random_state, so the training
     rows, and therefore the importances, can differ between runs; fixed
     literals would make the test pass or fail depending on the split.
     """
     import numpy as np  # local import: used only to rank the importances

     M, labels = uft.generate_test_matrix(1000, 15, random_state=0)
     M = utils.cast_np_sa_to_nd(M)
     M_train, M_test, labels_train, labels_test = train_test_split(
         M,
         labels)
     clf = RandomForestClassifier(random_state=0)
     clf.fit(M_train, labels_train)

     col_names = ['f{}'.format(i) for i in range(15)]
     importances = clf.feature_importances_
     # argsort is ascending, so reverse it before taking the top ten
     top_ranks = np.argsort(importances)[::-1][:10]
     ctrl = utils.convert_to_sa(
         list(zip(col_names, importances)),
         col_names=('feat_name', 'score'))[top_ranks]

     # column names taken from M
     res = dsp.get_top_features(clf, M, verbose=False)
     self.assertTrue(uft.array_equal(ctrl, res))

     # column names passed explicitly
     res = dsp.get_top_features(clf, col_names=list(col_names),
                                verbose=False)
     self.assertTrue(uft.array_equal(ctrl, res))
예제 #18
0
    def test_table(self):
        data = np.array(['a', 'b', 'a', 'b', 'b', 'b', 'b', 'a', 'c', 'c', 
                         'b', 'c', 'a'], dtype='O')
        ctrl_sa = np.array(
                [('a', 4), ('b', 6), ('c', 3)],
                dtype=[('col_name', 'S1'), ('count', int)])           
        ctrl_printout = """
  col_name count
0        a     4
1        b     6
2        c     3
        """.strip()
        with uft.rerout_stdout() as get_stdout:
            self.assertTrue(uft.array_equal(ctrl_sa, 
                                            table(data)))
            self.assertEqual(get_stdout().strip(), ctrl_printout)
예제 #19
0
 def test_get_top_features(self):
     """Check comm.get_top_features against the classifier's importances.

     The control array is derived from the fitted forest's
     feature_importances_ rather than from hard-coded scores.
     train_test_split is called without a random_state, so the training
     rows, and therefore the importances, can differ between runs; fixed
     literals would make the test pass or fail depending on the split.
     """
     import numpy as np  # local import: used only to rank the importances

     M, labels = uft.generate_test_matrix(1000, 15, random_state=0)
     M = utils.cast_np_sa_to_nd(M)
     M_train, M_test, labels_train, labels_test = train_test_split(
         M, labels)
     clf = RandomForestClassifier(random_state=0)
     clf.fit(M_train, labels_train)

     col_names = ['f{}'.format(i) for i in range(15)]
     importances = clf.feature_importances_
     # argsort is ascending, so reverse it before taking the top ten
     top_ranks = np.argsort(importances)[::-1][:10]
     ctrl = utils.convert_to_sa(
         list(zip(col_names, importances)),
         col_names=('feat_name', 'score'))[top_ranks]

     res = comm.get_top_features(clf, M, verbose=False)
     self.assertTrue(uft.array_equal(ctrl, res))
예제 #20
0
    def test_join(self):
        """Check utils.join for inner joins, column naming and outer joins.

        The first two scenarios use pandas DataFrame.merge as the reference
        implementation; the third compares against hard-coded rows for each
        of the 'inner', 'left', 'right' and 'outer' join types.
        """
        # --- basic inner join on differently named key columns ---
        employees = np.array(
            [(0, 'Lisa', 2),
             (1, 'Bill', 1),
             (2, 'Fred', 2),
             (3, 'Samantha', 2),
             (4, 'Augustine', 1),
             (5, 'William', 0)],
            dtype=[('id', int), ('name', 'O'), ('dept_id', int)])
        departments = np.array(
            [(0, 'accts receivable'),
             (1, 'accts payable'),
             (2, 'shipping')],
            dtype=[('id', int), ('name', 'S16')])
        merged = pd.DataFrame(employees).merge(
            pd.DataFrame(departments),
            left_on='dept_id',
            right_on='id')
        ctrl = merged.to_records(index=False)
        res = utils.join(employees, departments, 'inner', 'dept_id', 'id')
        self.assertTrue(uft.array_equal(ctrl, res, idx_col='id_x'))

        # --- column naming rules with multi-column keys and suffixes ---
        left_keys = ['idx0', 'a1_idx1', 'idx2']
        right_keys = ['idx0', 'a2_idx1', 'idx2']
        suffixes = ['_left', '_right']
        a1 = np.array([(0, 'a', 1, 2, 3)],
                      dtype=[('idx0', int),
                             ('name', 'O'),
                             ('a1_idx1', int),
                             ('idx2', int),
                             ('idx3', int)])
        a2 = np.array([(0, 'b', 1, 2, 3)],
                      dtype=[('idx0', int),
                             ('name', 'O'),
                             ('a2_idx1', int),
                             ('idx2', int),
                             ('idx3', int)])
        pd1 = pd.DataFrame(a1)
        pd2 = pd.DataFrame(a2)
        ctrl = pd1.merge(
            pd2,
            left_on=['idx0', 'a1_idx1', 'idx2'],
            right_on=['idx0', 'a2_idx1', 'idx2'],
            suffixes=['_left', '_right']).to_records(index=False)
        res = utils.join(a1, a2, 'inner',
                         left_on=left_keys,
                         right_on=right_keys,
                         suffixes=suffixes)
        self.assertTrue(uft.array_equal(ctrl, res, idx_col='idx0'))

        # --- inner / left / right / outer joins on a shared key ---
        side_dtype = [('key', int), ('label', 'O'), ('idx', int)]
        a1 = np.array([(0, 'a1_0', 0),
                       (1, 'a1_1', 1),
                       (1, 'a1_2', 2),
                       (2, 'a1_3', 3),
                       (3, 'a1_4', 4)],
                      dtype=side_dtype)
        a2 = np.array([(0, 'a2_0', 0),
                       (1, 'a2_1', 1),
                       (2, 'a2_2', 2),
                       (2, 'a2_3', 3),
                       (4, 'a2_4', 4)],
                      dtype=side_dtype)
        merged_dtype = [('key', int), ('label_x', 'O'), ('idx_x', int),
                        ('label_y', 'O'), ('idx_y', int)]

        # Rows whose key appears on both sides; every join type has them.
        matched = [(0, 'a1_0', 0, 'a2_0', 0),
                   (1, 'a1_1', 1, 'a2_1', 1),
                   (1, 'a1_2', 2, 'a2_1', 1),
                   (2, 'a1_3', 3, 'a2_2', 2),
                   (2, 'a1_3', 3, 'a2_3', 3)]
        # Unmatched rows carry '' / -999 in the columns of the absent side.
        left_only = (3, 'a1_4', 4, '', -999)
        right_only = (4, '', -999, 'a2_4', 4)
        scenarios = [('inner', matched),
                     ('left', matched + [left_only]),
                     ('right', matched + [right_only]),
                     ('outer', matched + [right_only, left_only])]
        for how, rows in scenarios:
            res = utils.join(a1, a2, how, left_on='key', right_on='key')
            ctrl = np.array(rows, dtype=merged_dtype)
            self.assertTrue(uft.array_equal(ctrl, res))