def test_type_guess(self):
    """Guess one type per column for a noisy six-column inline CSV.

    The fixture mixes numbers, two date layouts, yes/no/true/false
    flags, blank cells and stray text ('foo', 'bar'); the guess taken
    from the row set's sample is compared against six expected types.
    """
    # One CSV record per physical line: the row breaks are what make the
    # data six columns wide, matching the six expected types below.
    # NOTE(review): each record keeps its leading indentation inside the
    # literal; this assumes surrounding whitespace in a cell is ignored
    # by the type guesser -- confirm.
    csv_file = StringIO.StringIO('''
        1, 2012/2/12, 2, 02 October 2011, yes, 1
        2, 2012/2/12, 2, 02 October 2011, true, 1
        2.4, 2012/2/12, 1, 1 May 2011, no, 0
        foo, bar, 1000, , false, 0
        4.3, , 42, 24 October 2012,,
        , 2012/2/12, 21, 24 December 2013, true, 1''')
    rows = CSVTableSet(csv_file).tables[0]
    guessed_types = type_guess(rows.sample)

    assert_equal(guessed_types, [
        DecimalType(), DateType('%Y/%m/%d'),
        IntegerType(), DateType('%d %B %Y'),
        BoolType(), BoolType()])
def test_null_process(self):
    """Check which cells of null.csv read back as None.

    The None pattern of the first three rows is asserted twice: once
    with only ``null_processor(['null'])`` registered, and again after
    the strictly guessed types have been registered through
    ``types_processor``.
    """
    def none_mask(cell_rows):
        # True wherever a cell's value is None, row by row.
        return [[cell.value is None for cell in cells]
                for cells in cell_rows]

    null_rows = CSVTableSet(horror_fobj('null.csv')).tables[0]
    null_rows.register_processor(null_processor(['null']))

    mask = none_mask(list(null_rows))
    before_typing = (
        [False, True, False, False],
        [False, False, False, True],
        [False, True, False, False],
    )
    for position, wanted in enumerate(before_typing):
        assert_equal(mask[position], wanted)

    types = type_guess(null_rows.sample, strict=True)
    assert_equal(
        types,
        [IntegerType(), BoolType(), BoolType(), BoolType()])

    null_rows.register_processor(types_processor(types))

    # after applying the types, '' should become None for int columns
    mask = none_mask(list(null_rows))
    after_typing = (
        [False, True, False, False],
        [False, False, False, True],
        [False, True, True, True],
    )
    for position, wanted in enumerate(after_typing):
        assert_equal(mask[position], wanted)
def test_apply_null_values(self):
    """Check ``empty`` flags and values of null.csv after typing.

    Types are guessed strictly from the sample, verified, and then
    applied with ``types_processor``; no null processor is registered
    here, so the literal text "null" stays an ordinary string.
    """
    typed_set = CSVTableSet(horror_fobj('null.csv')).tables[0]

    types = type_guess(typed_set.sample, strict=True)
    assert_equal(
        types,
        [IntegerType(), StringType(), BoolType(), StringType()])

    typed_set.register_processor(types_processor(types))
    typed_rows = list(typed_set)

    # treat null as non empty text and 0 as non empty integer
    empty_flags = (
        [False, False, False, False],
        [False, False, False, False],
        [False, False, True, True],
        [False, False, False, False],
        [False, False, False, True],
        [False, False, False, True],
    )
    for position, wanted in enumerate(empty_flags):
        assert [cell.empty for cell in typed_rows[position]] == wanted

    # In the third row the empty cell of the bool column is expected to
    # come back as None, while the empty string-column cell stays "".
    third_row = typed_rows[2]
    assert [cell.value for cell in third_row] == [3, "null", None, ""], \
        typed_rows[2]
types = type_guess(row_set.sample) #types = type_guess(row_set.sample, strict=True) print('guessed types:', types) # constructing ddl cols = [] for indx, typ in enumerate(types): if typ == StringType(): cols.append(" `a%s` string" % (indx)) elif typ == DateType(date_format): cols.append(" `a%s` date" % (indx)) elif typ == DecimalType(): cols.append(" `a%s` double" % (indx)) elif typ == IntegerType(): cols.append(" `a%s` int" % (indx)) elif typ == BoolType(): cols.append(" `a%s` boolean" % (indx)) else: raise Exception("A type of column %indx cannot be handled. %s " % (indx, typ)) cols_str = ",\n".join(cols) ddl = '''---------------------------------------------------------- CREATE EXTERNAL TABLE IF NOT EXISTS default.%s ( %s ) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' WITH SERDEPROPERTIES ( "separatorChar" = "%s", "quoteChar" = "'", "escapeChar" = "\\\\"