def test_flag_null(self):
    """Detect a column that is null in most rows and populated in a few.

    The fixture holds 18 missing values followed by 2 ones; the
    'flag null' label is checked with ``assertHasColType``.
    """
    missing = [None] * 18
    populated = [1] * 2
    series = pd.Series(missing + populated)

    col_types, disqualified = ops.col_type(series)

    self.assertHasColType('flag null', col_types, disqualified)
def test_num_accounting(self):
    """Detect a numeric column whose values resemble accounting totals.

    The fixture mixes zeros with round amounts spanning several orders
    of magnitude; the 'num accounting' label is checked with
    ``assertHasColType``.
    """
    amounts = [
        0, 0, 0,
        100, 110,
        1000,
        1_000_000,
        50_000,
        0,
        7000,
    ]
    series = pd.Series(amounts)

    col_types, disqualified = ops.col_type(series)

    self.assertHasColType('num accounting', col_types, disqualified)
def test_date_reg(self):
    """Detect a date column whose values are regularly spaced.

    The fixture is the first day of each month from January through
    June 2020; the ``ops.DateRegularColumn.label`` label is checked
    with ``assertHasColType``.
    """
    month_starts = [
        datetime.date(2020, month, 1)
        for month in range(1, 7)
    ]
    series = pd.Series(month_starts)

    col_types, disqualified = ops.col_type(series)

    self.assertHasColType(
        ops.DateRegularColumn.label, col_types, disqualified)
def test_text(self):
    """Detect a column with free form text.

    The fixture contains a repeated short comment, a longer comment,
    a missing value and an empty string.
    """
    col = pd.Series([
        'this is a comment',
        None,
        'this is a comment',
        'this is a much longer comment that contains more words',
        '',
    ])
    # Every other test in this suite unpacks ops.col_type() into a
    # (cts, disquals) pair and asserts through the
    # (label, cts, disquals) helpers; do the same here instead of passing
    # the un-unpacked pair to a differently shaped assertion.
    # NOTE(review): if 'text' must be the *only* detected type, switch to
    # assertOnlyColType -- confirm the intended strictness.
    (cts, disquals) = ops.col_type(col)
    self.assertHasColType('text', cts, disquals)
def test_date_irreg(self):
    """Detect a date column whose values are irregularly spaced.

    The fixture holds five 2020 dates with uneven gaps between them;
    ``assertNotColType`` is used with ``ops.DateRegularColumn.label``.
    """
    uneven_dates = [
        datetime.date(2020, month, day)
        for month, day in (
            (1, 15),
            (1, 27),
            (2, 1),
            (9, 30),
            (10, 31),
        )
    ]
    series = pd.Series(uneven_dates)

    col_types, disqualified = ops.col_type(series)

    self.assertNotColType(
        ops.DateRegularColumn.label, col_types, disqualified)
def test_flag(self):
    """Detect a two-valued column with one common and one rare value.

    The fixture holds 17 'Y' entries followed by 3 'N' entries; the
    'flag' label is checked with ``assertHasColType``.
    """
    common = ['Y'] * 17
    rare = ['N'] * 3
    series = pd.Series(common + rare)

    col_types, disqualified = ops.col_type(series)

    self.assertHasColType('flag', col_types, disqualified)
def test_num_long_tail(self):
    """Detect a numeric column with a long tail distribution.

    The fixture has many small values and progressively fewer large
    ones; the 'num long tail' label is checked with
    ``assertHasColType``.
    """
    samples = [
        1, 1, 1, 1, 1,
        2, 2, 2, 2,
        3, 3, 3,
        4, 4,
        5, 5,
        6, 6,
        8,
        10,
    ]
    series = pd.Series(samples)

    col_types, disqualified = ops.col_type(series)

    self.assertHasColType('num long tail', col_types, disqualified)
def test_num_normal(self):
    """Detect a normally distributed numeric column.

    The fixture clusters around 5 and thins out towards both ends;
    the 'num normal' label is checked with ``assertHasColType``.
    """
    samples = [
        1,
        2,
        3, 3,
        4, 4, 4, 4,
        5, 5, 5, 5, 5, 5, 5, 5,
        6, 6, 6, 6,
        7, 7,
        8,
        11,
    ]
    series = pd.Series(samples)

    col_types, disqualified = ops.col_type(series)

    self.assertHasColType('num normal', col_types, disqualified)
def test_categorical_num(self):
    """Detect a categorical column whose categories are numbers.

    The fixture holds three 10s and a single 20; the 'categorical'
    label is checked with ``assertHasColType``.
    """
    series = pd.Series([10, 10, 10, 20])

    col_types, disqualified = ops.col_type(series)

    self.assertHasColType('categorical', col_types, disqualified)
def test_categorical_alpha(self):
    """Detect a categorical column whose categories are strings.

    The fixture holds three 'cat' entries and a single 'dog'; the
    'categorical' label is checked with ``assertOnlyColType``.
    """
    series = pd.Series(['cat', 'cat', 'cat', 'dog'])

    col_types, disqualified = ops.col_type(series)

    self.assertOnlyColType('categorical', col_types, disqualified)
def test_id(self):
    """Detect an ID column.

    The fixture is the consecutive integers 1 through 5; the 'id'
    label is checked with ``assertHasColType``.
    """
    identifiers = [1, 2, 3, 4, 5]
    series = pd.Series(identifiers)

    col_types, disqualified = ops.col_type(series)

    self.assertHasColType('id', col_types, disqualified)