def test_replace_missing_vals(self):
    """Check replace_missing_vals against hand-built expectations.

    Four scenarios are run against the same structured-array fixture:
    the 'constant' strategy with the default missing value, the
    'constant' strategy with ``missing_val=0``, the 'most_frequent'
    strategy with ``missing_val=-999`` and the 'mean' strategy with
    ``missing_val=np.nan``.
    """
    column_types = [('str', 'O'), ('int', int), ('float1', float),
                    ('float2', float)]
    rows = [('a', 0, 0.0, 0.1),
            ('b', 1, 1.0, np.nan),
            ('', -999, np.nan, 0.0),
            ('d', 1, np.nan, 0.2),
            ('', -999, 2.0, np.nan)]
    fixture = np.array(rows, dtype=column_types)

    def expected_with(**columns):
        # Copy the fixture at call time and overwrite only the named
        # columns; every other column keeps the fixture's values.
        expected = fixture.copy()
        for column_name, values in columns.items():
            expected[column_name] = np.array(values)
        return expected

    # 'constant' with the default missing value: the NaN entries of the
    # float columns are expected to become -1.0.
    expected = expected_with(float1=[0.0, 1.0, -1.0, -1.0, 2.0],
                             float2=[0.1, -1.0, 0.0, 0.2, -1.0])
    actual = replace_missing_vals(fixture, 'constant', constant=-1.0)
    self.assertTrue(np.array_equal(expected, actual))

    # 'constant' with missing_val=0: zeros in the numeric columns are
    # expected to become 100, while NaN entries stay NaN.
    expected = expected_with(int=[100, 1, -999, 1, -999],
                             float1=[100, 1.0, np.nan, np.nan, 2.0],
                             float2=[0.1, np.nan, 100, 0.2, np.nan])
    actual = replace_missing_vals(fixture, 'constant', missing_val=0,
                                  constant=100)
    self.assertTrue(utils_for_tests.array_equal(expected, actual))

    # 'most_frequent' with missing_val=-999: the -999 entries of the
    # 'int' column are expected to become 1.
    expected = expected_with(int=[0, 1, 1, 1, 1])
    actual = replace_missing_vals(fixture, 'most_frequent',
                                  missing_val=-999)
    self.assertTrue(utils_for_tests.array_equal(expected, actual))

    # 'mean' with missing_val=np.nan: NaN entries are expected to become
    # the mean of the remaining values in the same column.
    expected = expected_with(float1=[0.0, 1.0, 1.0, 1.0, 2.0],
                             float2=[0.1, 0.1, 0.0, 0.2, 0.1])
    actual = replace_missing_vals(fixture, 'mean', missing_val=np.nan)
    self.assertTrue(utils_for_tests.array_equal(expected, actual))
def proc_array(M, labels, test_or_train, interval_start, interval_end,
               label_interval_start, label_interval_end, row_M_start,
               row_M_end):
    """Drop rows with a NaN label, then fill missing values in M.

    Args:
        M: array of rows, indexable by a boolean mask built from
            ``labels``.
        labels: numeric array with one label per row of ``M``; entries
            that are NaN mark rows to discard.
        test_or_train, interval_start, interval_end,
        label_interval_start, label_interval_end, row_M_start,
        row_M_end: accepted as part of the signature but not used in
            this function.

    Returns:
        A ``(M, labels)`` tuple restricted to the rows whose label is
        not NaN, where ``M`` has additionally been passed through
        ``replace_missing_vals(..., 'constant', constant=0)``.
    """
    has_label = np.logical_not(np.isnan(labels))
    labelled_rows = M[has_label]
    # Missing-value replacement runs only on the rows that are kept.
    return (replace_missing_vals(labelled_rows, 'constant', constant=0),
            labels[has_label])
return 'SubsetSchool({})'.format(grades) DATA_PATH = '/home/zar1/hs-scratch/' fin = open(os.path.join(DATA_PATH, 'data_rec_array.pkl')) print 'loading data' M = cPickle.load(fin) fin.close() print 'data loaded' y = M['label'] M = remove_cols(M, ['label', 'student_id', 'index']) print 'set up data' M = replace_missing_vals(M, 'constant', np.nan) print 'imputed' min_year = min(M['cohort']) clfs = [{'clf': RandomForestClassifier, 'random_state': [0]}] csvs = [] train_start = min_year train_window_size = 2 init_train_window_end = train_start + train_window_size - 1 for max_grade in xrange(9, 12): print 'making experiment' print max_grade test_start = init_train_window_end + (12 - max_grade) subsets = [{'subset': SubsetSchool, 'max_grades': [[max_grade]]}]