예제 #1
0
    def test_replace_missing_vals(self):
        """Compare replace_missing_vals output with hand-built expectations.

        One structured-array fixture is run through four scenarios:
        'constant' with default arguments, 'constant' with an explicit
        missing_val, 'most_frequent', and 'mean'.
        """
        col_types = [('str', 'O'), ('int', int), ('float1', float),
                     ('float2', float)]
        rows = [('a', 0, 0.0, 0.1),
                ('b', 1, 1.0, np.nan),
                ('', -999, np.nan, 0.0),
                ('d', 1, np.nan, 0.2),
                ('', -999, 2.0, np.nan)]
        M = np.array(rows, dtype=col_types)

        # 'constant' with no missing_val given: NaNs in the float columns are
        # expected to become the supplied constant.
        expected = M.copy()
        expected['float1'] = np.array([0.0, 1.0, -1.0, -1.0, 2.0])
        expected['float2'] = np.array([0.1, -1.0, 0.0, 0.2, -1.0])
        actual = replace_missing_vals(M, 'constant', constant=-1.0)
        self.assertTrue(np.array_equal(expected, actual))

        # 'constant' with missing_val=0: zeros in numeric columns become 100,
        # NaNs stay as they are.
        expected = M.copy()
        expected['int'] = np.array([100, 1, -999, 1, -999])
        expected['float1'] = np.array([100, 1.0, np.nan, np.nan, 2.0])
        expected['float2'] = np.array([0.1, np.nan, 100, 0.2, np.nan])
        actual = replace_missing_vals(M, 'constant', missing_val=0,
                                      constant=100)
        self.assertTrue(utils_for_tests.array_equal(expected, actual))

        # 'most_frequent' with missing_val=-999: only the int column holds
        # -999, and its expected fill value is 1.
        expected = M.copy()
        expected['int'] = np.array([0, 1, 1, 1, 1])
        actual = replace_missing_vals(M, 'most_frequent', missing_val=-999)
        self.assertTrue(utils_for_tests.array_equal(expected, actual))

        # 'mean' with missing_val=NaN: NaNs are expected to take the mean of
        # the remaining values in their column.
        expected = M.copy()
        expected['float1'] = np.array([0.0, 1.0, 1.0, 1.0, 2.0])
        expected['float2'] = np.array([0.1, 0.1, 0.0, 0.2, 0.1])
        actual = replace_missing_vals(M, 'mean', missing_val=np.nan)
        self.assertTrue(utils_for_tests.array_equal(expected, actual))
예제 #2
0
    def test_replace_missing_vals(self):
        """Exercise replace_missing_vals with each strategy used in this test.

        Every scenario overwrites selected columns on a copy of the fixture
        to form the expected array, then compares it with the function's
        result.
        """
        fixture = np.array(
            [('a', 0, 0.0, 0.1),
             ('b', 1, 1.0, np.nan),
             ('', -999, np.nan, 0.0),
             ('d', 1, np.nan, 0.2),
             ('', -999, 2.0, np.nan)],
            dtype=[('str', 'O'), ('int', int), ('float1', float),
                   ('float2', float)])

        def expected_with(**columns):
            # Copy of the fixture with the named columns overwritten.
            out = fixture.copy()
            for col_name, values in columns.items():
                out[col_name] = np.array(values)
            return out

        # Strategy 'constant', default missing value.
        want = expected_with(float1=[0.0, 1.0, -1.0, -1.0, 2.0],
                             float2=[0.1, -1.0, 0.0, 0.2, -1.0])
        got = replace_missing_vals(fixture, 'constant', constant=-1.0)
        self.assertTrue(np.array_equal(want, got))

        # Strategy 'constant', treating 0 as the missing value.
        want = expected_with(int=[100, 1, -999, 1, -999],
                             float1=[100, 1.0, np.nan, np.nan, 2.0],
                             float2=[0.1, np.nan, 100, 0.2, np.nan])
        got = replace_missing_vals(fixture, 'constant', missing_val=0,
                                   constant=100)
        self.assertTrue(utils_for_tests.array_equal(want, got))

        # Strategy 'most_frequent', treating -999 as the missing value.
        want = expected_with(int=[0, 1, 1, 1, 1])
        got = replace_missing_vals(fixture, 'most_frequent',
                                   missing_val=-999)
        self.assertTrue(utils_for_tests.array_equal(want, got))

        # Strategy 'mean', treating NaN as the missing value.
        want = expected_with(float1=[0.0, 1.0, 1.0, 1.0, 2.0],
                             float2=[0.1, 0.1, 0.0, 0.2, 0.1])
        got = replace_missing_vals(fixture, 'mean', missing_val=np.nan)
        self.assertTrue(utils_for_tests.array_equal(want, got))
예제 #3
0
def proc_array(M, labels, test_or_train, interval_start, interval_end,
               label_interval_start, label_interval_end,
               row_M_start, row_M_end):
    """Drop rows whose label is NaN and impute the remaining feature rows.

    Only ``M`` and ``labels`` are used; the other parameters are accepted
    but ignored (presumably to satisfy a fixed callback signature -- confirm
    against the caller).

    Returns:
        A ``(M, labels)`` tuple restricted to rows with a non-NaN label,
        where ``M`` has been passed through
        ``replace_missing_vals(..., 'constant', constant=0)``.
    """
    has_label = ~np.isnan(labels)
    kept_labels = labels[has_label]
    imputed = replace_missing_vals(M[has_label], 'constant', constant=0)
    return (imputed, kept_labels)
예제 #4
0
def proc_array(M, labels, test_or_train, interval_start, interval_end,
               label_interval_start, label_interval_end, row_M_start,
               row_M_end):
    """Keep only rows with a non-NaN label, imputing the kept feature rows.

    ``test_or_train`` and the interval / row-range parameters are not read
    by this function.

    Returns:
        Tuple ``(M, labels)``: the feature rows whose label is not NaN,
        after ``replace_missing_vals`` with the 'constant' strategy and
        ``constant=0``, together with the matching labels.
    """
    labelled = np.logical_not(np.isnan(labels))
    # Features are filtered and imputed first, then the labels are filtered
    # with the same mask so the two stay aligned.
    return (replace_missing_vals(M[labelled], 'constant', constant=0),
            labels[labelled])
예제 #5
0
        return 'SubsetSchool({})'.format(grades)

DATA_PATH = '/home/zar1/hs-scratch/'

fin = open(os.path.join(DATA_PATH, 'data_rec_array.pkl'))
print 'loading data'
M = cPickle.load(fin)
fin.close()
print 'data loaded'

y = M['label']
M = remove_cols(M, ['label', 'student_id', 'index'])

print 'set up data'

M = replace_missing_vals(M, 'constant', np.nan)
print 'imputed'


min_year = min(M['cohort'])

clfs = [{'clf': RandomForestClassifier, 'random_state': [0]}]
csvs = []
train_start = min_year
train_window_size = 2
init_train_window_end = train_start + train_window_size - 1
for max_grade in xrange(9, 12):
    print 'making experiment'
    print max_grade
    test_start = init_train_window_end + (12 - max_grade)
    subsets = [{'subset': SubsetSchool, 'max_grades': [[max_grade]]}]