Пример #1
0
 def test_array_emitter(self):
     """Aggregate the 'rg_complex_dates' table (read via ``self.conn_str``)
     over three 2010 intervals and compare each emitted array to a
     hand-computed control.

     Fix: dropped the unused local ``db_file`` -- a leftover sqlite path;
     this test reads from ``self.conn_str``, not from the .db file.
     """
     ae = array_emitter.ArrayEmitter(convert_to_unix_time=True)
     # Every feature of interest is summed over the interval.
     for feature_name in ('bounded', 'no_start', 'no_stop', 'unbounded'):
         ae = ae.set_aggregation(feature_name, 'SUM')
     ae = ae.get_rg_from_sql(self.conn_str,
                             'rg_complex_dates',
                             feature_col='feature')
     res1 = ae.set_interval(datetime(2010, 1, 1),
                            datetime(2010, 6, 30)).emit_M()
     res2 = ae.set_interval(datetime(2010, 7, 1),
                            datetime(2010, 12, 31)).emit_M()
     res3 = ae.set_interval(datetime(2010, 1, 1),
                            datetime(2010, 12, 31)).emit_M()
     # NOTE(review): column names appear lowercased ('bounded_sum'),
     # presumably by the SQL backend -- confirm against get_rg_from_sql.
     ctrl_dtype = [('id', '<i8'), ('bounded_sum', '<f8'),
                   ('no_start_sum', '<f8'), ('no_stop_sum', '<f8'),
                   ('unbounded_sum', '<f8')]
     ctrl1_dat = [(0, 1.0, 100.0, 100000.0, 1000000.0),
                  (1, 0.01, 0.001, 1e-06, 1e-07),
                  (2, np.nan, np.nan, np.nan, 2e-08)]
     ctrl2_dat = [(0, 10.0, 1000.0, 10000.0, 1000000.0),
                  (1, 0.1, 0.0001, 1e-05, 1e-07),
                  (2, np.nan, np.nan, np.nan, 2e-08)]
     ctrl3_dat = [(0, 11.0, 1100.0, 110000.0, 1000000.0),
                  (1, 0.11, 0.0011, 1.1e-05, 1e-07),
                  (2, np.nan, np.nan, np.nan, 2e-08)]
     for res, ctrl_dat in zip((res1, res2, res3),
                              (ctrl1_dat, ctrl2_dat, ctrl3_dat)):
         self.assertTrue(
             uft.array_equal(res,
                             np.array(ctrl_dat, dtype=ctrl_dtype),
                             idx_col='id'))
Пример #2
0
 def test_array_emitter(self):
     """Sum four features over three date intervals of a SQL relation
     grid and compare against hand-computed controls.

     Fix: removed the unused local ``db_file`` (stale sqlite path; the
     data actually comes from ``self.conn_str``).
     """
     ae = array_emitter.ArrayEmitter(convert_to_unix_time=True)
     ae = ae.set_aggregation('bounded', 'SUM')
     ae = ae.set_aggregation('no_start', 'SUM')
     ae = ae.set_aggregation('no_stop', 'SUM')
     ae = ae.set_aggregation('unbounded', 'SUM')
     ae = ae.get_rg_from_sql(self.conn_str, 'rg_complex_dates',
                             feature_col='feature')
     # First half of 2010, second half, and the full year.
     res1 = ae.set_interval(datetime(2010, 1, 1),
                            datetime(2010, 6, 30)).emit_M()
     res2 = ae.set_interval(datetime(2010, 7, 1),
                            datetime(2010, 12, 31)).emit_M()
     res3 = ae.set_interval(datetime(2010, 1, 1),
                            datetime(2010, 12, 31)).emit_M()
     ctrl_dtype = [('id', '<i8'), ('bounded_sum', '<f8'),
                   ('no_start_sum', '<f8'), ('no_stop_sum', '<f8'),
                   ('unbounded_sum', '<f8')]
     ctrl1_dat = [(0, 1.0, 100.0, 100000.0, 1000000.0),
                  (1, 0.01, 0.001, 1e-06, 1e-07),
                  (2, np.nan, np.nan, np.nan, 2e-08)]
     ctrl2_dat = [(0, 10.0, 1000.0, 10000.0, 1000000.0),
                  (1, 0.1, 0.0001, 1e-05, 1e-07),
                  (2, np.nan, np.nan, np.nan, 2e-08)]
     ctrl3_dat = [(0, 11.0, 1100.0, 110000.0, 1000000.0),
                  (1, 0.11, 0.0011, 1.1e-05, 1e-07),
                  (2, np.nan, np.nan, np.nan, 2e-08)]
     for res, ctrl_dat in zip((res1, res2, res3),
                              (ctrl1_dat, ctrl2_dat, ctrl3_dat)):
         self.assertTrue(uft.array_equal(
             res,
             np.array(ctrl_dat, dtype=ctrl_dtype), idx_col='id'))
Пример #3
0
 def test_from_csv(self):
     """Aggregate a CSV-backed relation grid over three 2010 intervals
     and compare each emitted array with a hand-computed control."""
     csv_path = uft.path_of_data('rg_complex_dates.csv')
     ae = array_emitter.ArrayEmitter()
     for feature_name in ('bounded', 'no_start', 'no_stop', 'unbounded'):
         ae = ae.set_aggregation(feature_name, 'SUM')
     ae = ae.get_rg_from_csv(csv_path,
                             feature_col='feature',
                             parse_datetimes=['start', 'stop'])
     # First half of 2010, second half, and the full year.
     intervals = [(datetime(2010, 1, 1), datetime(2010, 6, 30)),
                  (datetime(2010, 7, 1), datetime(2010, 12, 31)),
                  (datetime(2010, 1, 1), datetime(2010, 12, 31))]
     results = [ae.set_interval(start, stop).emit_M()
                for start, stop in intervals]
     ctrl_dtype = [('id', '<i8'), ('bounded_SUM', '<f8'),
                   ('no_start_SUM', '<f8'), ('no_stop_SUM', '<f8'),
                   ('unbounded_SUM', '<f8')]
     expected = [
         [(0, 1.0, 100.0, 100000.0, 1000000.0),
          (1, 0.01, 0.001, 1e-06, 1e-07),
          (2, np.nan, np.nan, np.nan, 2e-08)],
         [(0, 10.0, 1000.0, 10000.0, 1000000.0),
          (1, 0.1, 0.0001, 1e-05, 1e-07),
          (2, np.nan, np.nan, np.nan, 2e-08)],
         [(0, 11.0, 1100.0, 110000.0, 1000000.0),
          (1, 0.11, 0.0011, 1.1e-05, 1e-07),
          (2, np.nan, np.nan, np.nan, 2e-08)]]
     for res, ctrl_dat in zip(results, expected):
         self.assertTrue(
             uft.array_equal(res, np.array(ctrl_dat, dtype=ctrl_dtype)))
Пример #4
0
 def test_subset_over(self):
     """Run subset_over across interval and row windows, then write a
     report and a CSV (smoke test -- no output assertions)."""
     conn_str = 'sqlite:///{}'.format(uft.path_of_data('rg_subset_over.db'))
     ae = array_emitter.ArrayEmitter()
     ae = ae.get_rg_from_sql(conn_str, 'subset_over')
     ae = ae.set_default_aggregation('SUM')
     subset_kwargs = dict(
         label_col='label',
         label_col_aggr_of_interest='SUM',
         interval_train_window_start=2004,
         interval_train_window_end=2005,
         interval_test_window_start=2006,
         interval_test_window_end=2007,
         interval_inc_value=1,
         interval_expanding=False,
         row_M_col_name='cohort',
         row_M_col_aggr_of_interest='SUM',
         row_M_train_window_start=2008,
         row_M_train_window_end=2008,
         row_M_test_window_start=2009,
         row_M_test_window_end=2009,
         row_M_inc_value=1,
         row_M_expanding=False,
         clfs=DBG_std_clfs)
     exp = ae.subset_over(**subset_kwargs)
     exp.make_report(verbose=False)
     exp.make_csv()
Пример #5
0
 def test_multiple_aggr(self):
     """Multiple aggregation functions per feature, with the label
     feature drawn from its own interval."""
     conn_str = 'sqlite:///{}'.format(uft.path_of_data('rg_students.db'))
     ae = (array_emitter.ArrayEmitter()
           .get_rg_from_sql(conn_str, 'rg_students')
           .set_default_aggregation(['AVG', 'MIN', 'MAX', 'COUNT'])
           .set_aggregation('absences', ['MIN', 'MAX'])
           .set_aggregation('graduated', 'MAX')
           .set_interval(2005, 2007)
           .set_label_feature('graduated')
           .set_label_interval(2009, 2009))
     res = ae.emit_M()
     ctrl_dtype = [('id', '<i8'), ('math_gpa_AVG', '<f8'),
                   ('math_gpa_MIN', '<f8'), ('math_gpa_MAX', '<f8'),
                   ('math_gpa_COUNT', '<i8'), ('english_gpa_AVG', '<f8'),
                   ('english_gpa_MIN', '<f8'), ('english_gpa_MAX', '<f8'),
                   ('english_gpa_COUNT', '<f8'), ('absences_MIN', '<f8'),
                   ('absences_MAX', '<f8'), ('graduated_MAX', '<f8')]
     ctrl_rows = [
         (0, 2.2, 2.1, 2.3, 2, 3.95, 3.9, 4.0, 2, 7.0, 8.0, 1.0),
         (1, 3.45, 3.4, 3.5, 2, np.nan, np.nan, np.nan, np.nan, 0.0, 0.0,
          0.0),
         (2, 3.4, 3.4, 3.4, 1.0, np.nan, np.nan, np.nan, np.nan, 14.0,
          96.0, np.nan)]
     ctrl = np.array(ctrl_rows, dtype=ctrl_dtype)
     self.assertTrue(uft.array_equal(res, ctrl))
Пример #6
0
 def test_select_rows_in_M(self):
     """select_rows_in_M filters rows of the emitted matrix with
     SQL-like predicates over aggregated columns.

     Fix: the final check was ``self.assertTrue(ctrl, res)``, which
     treats ``res`` as the failure *message* and passes whenever
     ``ctrl`` is truthy -- the arrays were never compared. Now compares
     them with ``uft.array_equal`` like the other tests in this file.
     """
     db_file = uft.path_of_data('rg_select_rows_in_M.db')
     conn_str = 'sqlite:///{}'.format(db_file)
     ae = array_emitter.ArrayEmitter()
     ae = ae.get_rg_from_sql(conn_str, 'select_rows_in_M')
     ae = ae.set_default_aggregation('SUM')
     ae_1 = ae.set_interval(2005, 2006)
     ae_1 = ae_1.select_rows_in_M('cohort_SUM = 2009')
     ae_2 = ae.set_interval(2005, 2007)
     ae_2 = ae_2.select_rows_in_M('cohort_SUM = 2010')
     # Selections compose: each cohort is split by AP-compsci status.
     ae_1_1 = ae_1.select_rows_in_M('took_ap_compsci_SUM')
     ae_1_2 = ae_1.select_rows_in_M('NOT took_ap_compsci_SUM')
     ae_2_1 = ae_2.select_rows_in_M('took_ap_compsci_SUM')
     ae_2_2 = ae_2.select_rows_in_M('NOT took_ap_compsci_SUM')
     ctrl_dtype = [('id', '<i8'), ('math_gpa_SUM', '<f8'),
                   ('english_gpa_SUM', '<f8'), ('absences_SUM', '<f8'),
                   ('cohort_SUM', '<f8'), ('took_ap_compsci_SUM', '<f8')]
     ctrl_data = [[(0, 1.0, 1.0, 1.0, 2009.0, 1.0)],
                  [(2, 3.0, 3.0, 3.0, 2009.0, 0.0)],
                  [(1, 2.2, 2.2, 2.2, 2010.0, 1.0)],
                  [(3, 4.4, 4.4, 4.4, 2010.0, 0.0)]]
     for ae_sel, dat in zip((ae_1_1, ae_1_2, ae_2_1, ae_2_2), ctrl_data):
         ctrl = np.array(dat, dtype=ctrl_dtype)
         res = ae_sel.emit_M()
         self.assertTrue(uft.array_equal(res, ctrl))
Пример #7
0
 def test_select_rows_in_M(self):
     """select_rows_in_M filters rows of the emitted matrix with
     SQL-like predicates over (unsuffixed) feature columns.

     Fix: ``self.assertTrue(ctrl, res)`` passed unconditionally --
     ``res`` was interpreted as the failure message, so nothing was
     actually compared. Compare arrays with ``uft.array_equal``.
     """
     db_file = uft.path_of_data('rg_select_rows_in_M.db')
     conn_str = 'sqlite:///{}'.format(db_file)
     ae = array_emitter.ArrayEmitter()
     ae = ae.get_rg_from_sql(conn_str, 'select_rows_in_M')
     ae = ae.set_default_aggregation('SUM')
     ae_1 = ae.set_interval(2005, 2006)
     ae_1 = ae_1.select_rows_in_M('cohort = 2009')
     ae_2 = ae.set_interval(2005, 2007)
     ae_2 = ae_2.select_rows_in_M('cohort = 2010')
     # Selections compose: each cohort is split by AP-compsci status.
     ae_1_1 = ae_1.select_rows_in_M('took_ap_compsci')
     ae_1_2 = ae_1.select_rows_in_M('NOT took_ap_compsci')
     ae_2_1 = ae_2.select_rows_in_M('took_ap_compsci')
     ae_2_2 = ae_2.select_rows_in_M('NOT took_ap_compsci')
     ctrl_dtype = [('id', '<i8'), ('math_gpa', '<f8'),
                   ('english_gpa', '<f8'), ('absences', '<f8'),
                   ('cohort', '<f8'), ('took_ap_compsci', '<f8')]
     ctrl_data = [[(0, 1.0, 1.0, 1.0, 2009.0, 1.0)],
                  [(2, 3.0, 3.0, 3.0, 2009.0, 0.0)],
                  [(1, 2.2, 2.2, 2.2, 2010.0, 1.0)],
                  [(3, 4.4, 4.4, 4.4, 2010.0, 0.0)]]
     for ae_sel, dat in zip((ae_1_1, ae_1_2, ae_2_1, ae_2_2), ctrl_data):
         ctrl = np.array(dat, dtype=ctrl_dtype)
         res = ae_sel.emit_M()
         self.assertTrue(uft.array_equal(res, ctrl))
Пример #8
0
 def test_from_csv(self):
     """Aggregate a CSV-backed relation grid over three 2010 intervals;
     this variant emits unsuffixed column names."""
     csv_path = uft.path_of_data('rg_complex_dates.csv')
     ae = array_emitter.ArrayEmitter()
     for feature_name in ('bounded', 'no_start', 'no_stop', 'unbounded'):
         ae = ae.set_aggregation(feature_name, 'SUM')
     ae = ae.get_rg_from_csv(csv_path, feature_col='feature',
                             parse_datetimes=['start', 'stop'])
     # First half of 2010, second half, and the full year.
     intervals = [(datetime(2010, 1, 1), datetime(2010, 6, 30)),
                  (datetime(2010, 7, 1), datetime(2010, 12, 31)),
                  (datetime(2010, 1, 1), datetime(2010, 12, 31))]
     results = [ae.set_interval(start, stop).emit_M()
                for start, stop in intervals]
     ctrl_dtype = [('id', '<i8'), ('bounded', '<f8'),
                   ('no_start', '<f8'), ('no_stop', '<f8'),
                   ('unbounded', '<f8')]
     expected = [
         [(0, 1.0, 100.0, 100000.0, 1000000.0),
          (1, 0.01, 0.001, 1e-06, 1e-07),
          (2, np.nan, np.nan, np.nan, 2e-08)],
         [(0, 10.0, 1000.0, 10000.0, 1000000.0),
          (1, 0.1, 0.0001, 1e-05, 1e-07),
          (2, np.nan, np.nan, np.nan, 2e-08)],
         [(0, 11.0, 1100.0, 110000.0, 1000000.0),
          (1, 0.11, 0.0011, 1.1e-05, 1e-07),
          (2, np.nan, np.nan, np.nan, 2e-08)]]
     for res, ctrl_dat in zip(results, expected):
         self.assertTrue(uft.array_equal(
             res, np.array(ctrl_dat, dtype=ctrl_dtype)))
Пример #9
0
 def test_basic(self):
     """Smoke test: one custom aggregation ('absences' -> MAX) over a
     single interval of the students table."""
     conn_str = 'sqlite:///{}'.format(uft.path_of_data('rg_students.db'))
     emitter = array_emitter.ArrayEmitter()
     emitter = emitter.get_rg_from_sql(conn_str, 'rg_students')
     emitter = emitter.set_aggregation('absences', 'MAX')
     emitter = emitter.set_interval(2005, 2007)
     res = emitter.emit_M()
     ctrl_dtype = [('id', '<i8'), ('math_gpa', '<f8'),
                   ('english_gpa', '<f8'), ('absences', '<f8')]
     ctrl = np.array([(0, 2.2, 3.95, 8.0),
                      (1, 3.45, np.nan, 0.0),
                      (2, 3.4, np.nan, 96.0)],
                     dtype=ctrl_dtype)
     self.assertTrue(uft.array_equal(res, ctrl))
Пример #10
0
 def setUpClass(cls):
     """Populate the test Postgres database from the environment config,
     or set ``cls.skip`` when the environment is not usable."""
     cls.skip = False
     env = os.environ
     required = ('DIOGENES_PGRES_TEST_HOST', 'DIOGENES_PGRES_TEST_DB',
                 'DIOGENES_PGRES_TEST_USER', 'DIOGENES_PGRES_TEST_PW')
     if not all(name in env for name in required):
         uft.print_in_box(
             'Skipping TestPgres',
             ('TestPgres requires the following environmental variables '
              'to be defined:\n'
              '* DIOGENES_PGRES_TEST_HOST\n'
              '* DIOGENES_PGRES_TEST_DB\n'
              '* DIOGENES_PGRES_TEST_USER\n'
              '* DIOGENES_PGRES_TEST_PW\n'
              '* DIOGENES_PGRES_TEST_PORT (optional)\n'
              'At least one of these is not defined. Skipping TestPgres'))
         cls.skip = True
         return
     pgres_host, pgres_db, pgres_user, pgres_pw = (
         env[name] for name in required)
     # Port is the only optional setting.
     pgres_port = env.get('DIOGENES_PGRES_TEST_PORT', '5432')
     # psql reads the password from PGPASSWORD rather than an argument.
     env['PGPASSWORD'] = pgres_pw
     populate_cmd = ['psql', '-h', pgres_host, '-d', pgres_db,
                     '-U', pgres_user, '-p', pgres_port,
                     '-f', uft.path_of_data('populate_pgres.sql')]
     if subprocess.call(populate_cmd):
         uft.print_in_box(
             'Skipping TestPgres',
             ('Could not populate database.\n'
              'Perhaps Postgres server is not running or the following\n'
              'environmental variables are defined incorrectly:\n'
              '* DIOGENES_PGRES_TEST_HOST\n'
              '* DIOGENES_PGRES_TEST_DB\n'
              '* DIOGENES_PGRES_TEST_USER\n'
              '* DIOGENES_PGRES_TEST_PW\n'
              '* DIOGENES_PGRES_TEST_PORT (optional)\n'
              'Skipping TestPgres'))
         cls.skip = True
         return
     cls.conn_str = 'postgresql://{}:{}@{}:{}/{}'.format(
         pgres_user, pgres_pw, pgres_host, pgres_port, pgres_db)
Пример #11
0
 def setUpClass(cls):
     """Load Postgres connection settings from the environment and run
     the population script; set ``cls.skip`` when either step fails."""
     cls.skip = False
     missing_msg = (
         'TestPgres requires the following environmental variables '
         'to be defined:\n'
         '* DIOGENES_PGRES_TEST_HOST\n'
         '* DIOGENES_PGRES_TEST_DB\n'
         '* DIOGENES_PGRES_TEST_USER\n'
         '* DIOGENES_PGRES_TEST_PW\n'
         '* DIOGENES_PGRES_TEST_PORT (optional)\n'
         'At least one of these is not defined. Skipping TestPgres')
     try:
         pgres_host = os.environ['DIOGENES_PGRES_TEST_HOST']
         pgres_db = os.environ['DIOGENES_PGRES_TEST_DB']
         pgres_user = os.environ['DIOGENES_PGRES_TEST_USER']
         pgres_pw = os.environ['DIOGENES_PGRES_TEST_PW']
     except KeyError:
         uft.print_in_box('Skipping TestPgres', missing_msg)
         cls.skip = True
         return
     # Port is the only optional setting.
     pgres_port = os.environ.get('DIOGENES_PGRES_TEST_PORT', '5432')
     # psql reads the password from PGPASSWORD rather than an argument.
     os.environ['PGPASSWORD'] = pgres_pw
     populate_cmd = ['psql', '-h', pgres_host, '-d', pgres_db,
                     '-U', pgres_user, '-p', pgres_port,
                     '-f', uft.path_of_data('populate_pgres.sql')]
     if subprocess.call(populate_cmd):
         uft.print_in_box(
             'Skipping TestPgres',
             ('Could not populate database.\n'
              'Perhaps Postgres server is not running or the following\n'
              'environmental variables are defined incorrectly:\n'
              '* DIOGENES_PGRES_TEST_HOST\n'
              '* DIOGENES_PGRES_TEST_DB\n'
              '* DIOGENES_PGRES_TEST_USER\n'
              '* DIOGENES_PGRES_TEST_PW\n'
              '* DIOGENES_PGRES_TEST_PORT (optional)\n'
              'Skipping TestPgres'))
         cls.skip = True
         return
     cls.conn_str = 'postgresql://{}:{}@{}:{}/{}'.format(
         pgres_user, pgres_pw, pgres_host, pgres_port, pgres_db)
Пример #12
0
    def test_feature_gen_lambda(self):
        """feature_gen_lambda is applied to every run: train matrices get
        the relevant feature times 3, test matrices times 2."""

        def feature_gen(M, labels, test_or_train, interval_start,
                        interval_end, label_interval_start,
                        label_interval_end, row_M_start, row_M_end):
            # Scale the feature of interest: x2 for test folds, x3 for train.
            factor = 2 if test_or_train == 'test' else 3
            new_col = M['relevent_feature_SUM'] * factor
            return append_cols(M, new_col, 'mult'), labels

        conn_str = 'sqlite:///{}'.format(
            uft.path_of_data('rg_subset_over.db'))
        ae = array_emitter.ArrayEmitter()
        ae = ae.get_rg_from_sql(conn_str, 'subset_over')
        ae = ae.set_default_aggregation('SUM')
        exp = ae.subset_over(label_col='label',
                             interval_train_window_start=2004,
                             interval_train_window_end=2005,
                             interval_test_window_start=2006,
                             interval_test_window_end=2007,
                             interval_inc_value=1,
                             label_col_aggr_of_interest='SUM',
                             interval_expanding=False,
                             row_M_col_name='cohort',
                             row_M_col_aggr_of_interest='SUM',
                             row_M_train_window_start=2008,
                             row_M_train_window_end=2008,
                             row_M_test_window_start=2009,
                             row_M_test_window_end=2009,
                             row_M_inc_value=1,
                             row_M_expanding=False,
                             clfs=DBG_std_clfs,
                             feature_gen_lambda=feature_gen)
        all_runs = it.chain.from_iterable(
            trial.runs_flattened() for trial in exp.trials)
        for run in all_runs:
            relevent_idx = run.col_names.index('relevent_feature_SUM')
            mult_idx = run.col_names.index('mult')
            self.assertTrue(np.allclose(run.M[:, relevent_idx] * 3,
                                        run.M[:, mult_idx]))
            self.assertTrue(np.allclose(run.M_test[:, relevent_idx] * 2,
                                        run.M_test[:, mult_idx]))
Пример #13
0
    def test_connect_sql(self):
        """connect_sql returns structured arrays; a caching connection
        stays consistent across repeated and distinct queries."""
        db_path = utils_for_tests.path_of_data('small.db')
        conn_str = 'sqlite:///{}'.format(db_path)
        ctrl = np.array([(1, u'Arthur', u'King', 40000.0, 2.1, 10),
                         (2, u'Jones', u'James', 1000000.0, 1.9, 2),
                         (3, u'The Moabite', u'Ruth', 50000.0, 1.8, 6)],
                        dtype=[('id', '<i8'), ('last_name', 'O'),
                               ('first_name', 'O'), ('salary', '<f8'),
                               ('height', '<f8'), ('usefulness', '<i8')])
        ctrl2 = np.array([(1,), (2,), (3,)], dtype=[('id', '<i8')])

        conn = read.connect_sql(conn_str)
        self.assertTrue(
            np.array_equal(conn.execute('SELECT * FROM employees'), ctrl))

        # With caching on, a repeated query and a different query must
        # both still return correct results.
        conn = read.connect_sql(conn_str, allow_caching=True)
        self.assertTrue(
            np.array_equal(conn.execute('SELECT * FROM employees'), ctrl))
        self.assertTrue(
            np.array_equal(conn.execute('SELECT id FROM employees'), ctrl2))
        self.assertTrue(
            np.array_equal(conn.execute('SELECT * FROM employees'), ctrl))
Пример #14
0
    def test_connect_sql(self):
        """Queries through a caching connection return the same structured
        arrays as a fresh connection."""
        conn_str = 'sqlite:///{}'.format(
            utils_for_tests.path_of_data('small.db'))
        ctrl_dtype = [('id', '<i8'), ('last_name', 'O'),
                      ('first_name', 'O'), ('salary', '<f8'),
                      ('height', '<f8'), ('usefulness', '<i8')]
        ctrl = np.array([(1, u'Arthur', u'King', 40000.0, 2.1, 10),
                         (2, u'Jones', u'James', 1000000.0, 1.9, 2),
                         (3, u'The Moabite', u'Ruth', 50000.0, 1.8, 6)],
                        dtype=ctrl_dtype)
        ctrl2 = np.array([(1,), (2,), (3,)], dtype=[('id', '<i8')])

        plain = read.connect_sql(conn_str)
        res = plain.execute('SELECT * FROM employees')
        self.assertTrue(np.array_equal(res, ctrl))

        # Exercise the cache: same query twice, with a distinct query in
        # between that must not poison the cached result.
        cached = read.connect_sql(conn_str, allow_caching=True)
        self.assertTrue(
            np.array_equal(cached.execute('SELECT * FROM employees'), ctrl))
        self.assertTrue(
            np.array_equal(cached.execute('SELECT id FROM employees'),
                           ctrl2))
        self.assertTrue(
            np.array_equal(cached.execute('SELECT * FROM employees'), ctrl))
Пример #15
0
 def test_subset_over_label_windows(self):
     """subset_over with label intervals rolling independently of the
     feature intervals; results go to a CSV (smoke test)."""
     conn_str = 'sqlite:///{}'.format(
         uft.path_of_data('rg_label_windows.db'))
     ae = array_emitter.ArrayEmitter()
     ae = ae.get_rg_from_sql(conn_str, 'label_windows')
     ae = ae.set_default_aggregation('SUM')
     window_kwargs = dict(
         label_col='inspection',
         interval_train_window_start=2000,
         interval_train_window_end=2001,
         interval_test_window_start=2002,
         interval_test_window_end=2003,
         interval_inc_value=1,
         label_col_aggr_of_interest='SUM',
         interval_expanding=False,
         label_interval_train_window_start=2007,
         label_interval_train_window_end=2007,
         label_interval_test_window_start=2009,
         label_interval_test_window_end=2009,
         label_interval_inc_value=1,
         label_interval_expanding=False)
     exp = ae.subset_over(**window_kwargs)
     exp.make_csv('label_window.csv')
Пример #16
0
    def test_feature_gen_lambda(self):
        """feature_gen_lambda multiplies the relevant feature by 3 on train
        matrices and by 2 on test matrices; verify both on every run."""

        def feature_gen(M, test_or_train, interval_start, interval_end,
                        row_M_start, row_M_end):
            factor = 2 if test_or_train == 'test' else 3
            return append_cols(M, M['relevent_feature'] * factor, 'mult')

        conn_str = 'sqlite:///{}'.format(
            uft.path_of_data('rg_subset_over.db'))
        ae = array_emitter.ArrayEmitter()
        ae = ae.get_rg_from_sql(conn_str, 'subset_over')
        ae = ae.set_default_aggregation('SUM')
        exp = ae.subset_over(
            label_col='label',
            interval_train_window_start=2004,
            interval_train_window_size=1,
            interval_test_window_start=2006,
            interval_test_window_size=1,
            interval_inc_value=1,
            interval_expanding=False,
            row_M_col_name='cohort',
            row_M_train_window_start=2008,
            row_M_train_window_size=0,
            row_M_test_window_start=2009,
            row_M_test_window_size=0,
            row_M_inc_value=1,
            row_M_expanding=False,
            clfs=DBG_std_clfs,
            feature_gen_lambda=feature_gen)
        runs = it.chain.from_iterable(
            trial.runs_flattened() for trial in exp.trials)
        for run in runs:
            relevent_idx = run.col_names.index('relevent_feature')
            mult_idx = run.col_names.index('mult')
            self.assertTrue(np.allclose(run.M[:, relevent_idx] * 3,
                                        run.M[:, mult_idx]))
            self.assertTrue(np.allclose(run.M_test[:, relevent_idx] * 2,
                                        run.M_test[:, mult_idx]))
Пример #17
0
 def test_subset_over(self):
     """subset_over with window sizes (rather than explicit window ends);
     emit a report and CSV as a smoke test."""
     conn_str = 'sqlite:///{}'.format(uft.path_of_data('rg_subset_over.db'))
     ae = array_emitter.ArrayEmitter()
     ae = ae.get_rg_from_sql(conn_str, 'subset_over')
     ae = ae.set_default_aggregation('SUM')
     subset_kwargs = dict(
         label_col='label',
         interval_train_window_start=2004,
         interval_train_window_size=1,
         interval_test_window_start=2006,
         interval_test_window_size=1,
         interval_inc_value=1,
         interval_expanding=False,
         row_M_col_name='cohort',
         row_M_train_window_start=2008,
         row_M_train_window_size=0,
         row_M_test_window_start=2009,
         row_M_test_window_size=0,
         row_M_inc_value=1,
         row_M_expanding=False,
         clfs=DBG_std_clfs)
     exp = ae.subset_over(**subset_kwargs)
     exp.make_report(verbose=False)
     exp.make_csv()
Пример #18
0
 def __pkl_store(self, obj, key):
     """Pickle *obj* into the test-data directory as ``<key>.pkl``.

     Fix: open in binary mode ('wb'). Pickle streams are byte streams;
     text mode corrupts them on platforms that translate newlines.
     """
     with open(uft.path_of_data(key + '.pkl'), 'wb') as pkl:
         cPickle.dump(obj, pkl)
Пример #19
0
 def test_open_csv_as_list(self):
     """open_csv_as_list parses mixed-type columns into Python values."""
     csv_file = utils_for_tests.path_of_data("mixed.csv")
     expected = [[0, 'Jim', 5.6], [1, 'Jill', 5.5]]
     self.assertEqual(open_csv_as_list(csv_file), expected)
Пример #20
0
import cPickle
import os
# Fix: unittest was used below (TestPerambulate subclasses
# unittest.TestCase) but never imported.
import unittest

import numpy as np

from sklearn.svm import SVC
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import StratifiedKFold

import eights.perambulate as per
import eights.communicate as comm

import utils_for_tests as uft

REPORT_PATH = uft.path_of_data('test_perambulate.pdf')
REFERENCE_REPORT_PATH = uft.path_of_data('test_perambulate_ref.pdf')
REFERENCE_PKL_PATH = uft.path_of_data('test_perambulate')


class TestPerambulate(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.report = comm.Report(report_path=REPORT_PATH)

    @classmethod
    def tearDownClass(cls):
        report_path = cls.report.to_pdf(verbose=False)
        uft.print_in_box('Test Perambulate visual regression tests', [
            'graphical output available at:', report_path,
            'Reference available at:', REFERENCE_REPORT_PATH
Пример #21
0
 def __get_ref_pkl(self, key):
     """Load and return the reference pickle ``<key>.pkl`` from the
     test-data directory.

     Fix: open in binary mode ('rb'); pickle data is a byte stream and
     text-mode reads can corrupt it on newline-translating platforms.
     """
     with open(uft.path_of_data(key + '.pkl'), 'rb') as pkl:
         return cPickle.load(pkl)
Пример #22
0
 def __pkl_store(self, obj, key):
     """Pickle *obj* into the test-data directory as ``<key>.pkl``.

     Fix: open in binary mode ('wb'); pickle output is bytes, and text
     mode corrupts it on newline-translating platforms.
     """
     with open(uft.path_of_data(key + '.pkl'), 'wb') as pkl:
         cPickle.dump(obj, pkl)
Пример #23
0
 def __get_ref_pkl(self, key):
     """Load and return the reference pickle ``<key>.pkl``.

     Fix: open in binary mode ('rb') -- pickle data is a byte stream.
     """
     with open(uft.path_of_data(key + '.pkl'), 'rb') as pkl:
         return cPickle.load(pkl)
Пример #24
0
import unittest
from datetime import datetime
from collections import Counter
import eights.communicate as comm
from eights.communicate.communicate import feature_pairs_in_tree
from eights.communicate.communicate import feature_pairs_in_rf
from eights import utils
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
import utils_for_tests as uft
import numpy as np
import matplotlib.pyplot as plt

REPORT_PATH=uft.path_of_data('test_communicate.pdf')
SUBREPORT_PATH=uft.path_of_data('test_communicate_sub.pdf')
REFERENCE_REPORT_PATH=uft.path_of_data('test_communicate_ref.pdf')

class TestCommunicate(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.report = comm.Report(report_path=REPORT_PATH)

    @classmethod
    def tearDownClass(cls):
        report_path = cls.report.to_pdf(verbose=False)
        uft.print_in_box(
            'Test communicate visual regression tests',
            ['graphical output available at:',
             report_path,
Пример #25
0
 def test_open_csv(self):
     """read.open_csv loads a mixed-type CSV into a structured array."""
     csv_file = utils_for_tests.path_of_data("mixed.csv")
     ctrl_dtype = [('id', '<i8'), ('name', 'O'), ('height', '<f8')]
     correct = np.array([(0, 'Jim', 5.6), (1, 'Jill', 5.5)],
                        dtype=ctrl_dtype)
     res = read.open_csv(csv_file)
     self.assertTrue(np.array_equal(res, correct))
Пример #26
0
import cPickle
import os
# Fix: unittest was used below (TestGridSearch subclasses
# unittest.TestCase) but never imported.
import unittest

import numpy as np

from sklearn.svm import SVC
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import StratifiedKFold

import diogenes.grid_search as per
import diogenes.display as dsp

import utils_for_tests as uft

REPORT_PATH = uft.path_of_data('test_grid_search.pdf')
REFERENCE_REPORT_PATH = uft.path_of_data('test_grid_search_ref.pdf')
REFERENCE_PKL_PATH = uft.path_of_data('test_grid_search')

class TestGridSearch(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
        cls.report = dsp.Report(report_path=REPORT_PATH)

    @classmethod
    def tearDownClass(cls):
        report_path = cls.report.to_pdf(verbose=False)
        uft.print_in_box(
                'Test Perambulate visual regression tests',
                ['graphical output available at:',
Пример #27
0
 def test_open_csv_as_structured_array(self):
     """open_csv_as_structured_array infers a per-column dtype for a
     mixed-type CSV."""
     csv_file = utils_for_tests.path_of_data("mixed.csv")
     ctrl_dtype = [('id', '<i8'), ('name', 'S4'), ('height', '<f8')]
     correct = np.array([(0, 'Jim', 5.6), (1, 'Jill', 5.5)],
                        dtype=ctrl_dtype)
     res = open_csv_as_structured_array(csv_file)
     self.assertTrue(np.array_equal(res, correct))
Пример #28
0
import unittest
from datetime import datetime
from collections import Counter
import eights.communicate as comm
from eights.communicate.communicate import feature_pairs_in_tree
from eights.communicate.communicate import feature_pairs_in_rf
from eights import utils
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
import utils_for_tests as uft
import numpy as np
import matplotlib.pyplot as plt

REPORT_PATH = uft.path_of_data('test_communicate.pdf')
SUBREPORT_PATH = uft.path_of_data('test_communicate_sub.pdf')
REFERENCE_REPORT_PATH = uft.path_of_data('test_communicate_ref.pdf')


class TestCommunicate(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.report = comm.Report(report_path=REPORT_PATH)

    @classmethod
    def tearDownClass(cls):
        report_path = cls.report.to_pdf(verbose=False)
        uft.print_in_box('Test communicate visual regression tests', [
            'graphical output available at:', report_path,
            'Reference available at:', REFERENCE_REPORT_PATH
Пример #29
0
# Fix: unittest and dsp were used below (TestDisplay subclasses
# unittest.TestCase; setUpClass calls dsp.Report) but were never imported.
import unittest

from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
import numpy as np
import matplotlib.pyplot as plt

import diogenes.display as dsp
from diogenes.display.display import feature_pairs_in_tree
from diogenes.display.display import feature_pairs_in_rf
from diogenes.display.display import table
from diogenes.display.display import crosstab
from diogenes.display.display import describe_cols
from diogenes import utils
import utils_for_tests as uft


REPORT_PATH = uft.path_of_data('test_display.pdf')
SUBREPORT_PATH = uft.path_of_data('test_display_sub.pdf')
REFERENCE_REPORT_PATH = uft.path_of_data('test_display_ref.pdf')

class TestDisplay(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.report = dsp.Report(report_path=REPORT_PATH)

    @classmethod
    def tearDownClass(cls):
        report_path = cls.report.to_pdf(verbose=False)
        uft.print_in_box(
            'Test display visual regression tests',
            ['graphical output available at:',
             report_path,
Пример #30
0
import cPickle
import os
# Fix: unittest was used below (TestPerambulate subclasses
# unittest.TestCase) but never imported.
import unittest

import numpy as np

from sklearn.svm import SVC
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import StratifiedKFold

import eights.perambulate as per
import eights.communicate as comm

import utils_for_tests as uft

REPORT_PATH = uft.path_of_data('test_perambulate.pdf')
REFERENCE_REPORT_PATH = uft.path_of_data('test_perambulate_ref.pdf')
REFERENCE_PKL_PATH = uft.path_of_data('test_perambulate')

class TestPerambulate(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
        cls.report = comm.Report(report_path=REPORT_PATH)

    @classmethod
    def tearDownClass(cls):
        report_path = cls.report.to_pdf(verbose=False)
        uft.print_in_box(
                'Test Perambulate visual regression tests',
                ['graphical output available at:',