def test_apply_null_values(self):
    # Strictly guess the column types of the 'null.csv' fixture, apply them
    # through types_processor, and check which cells are flagged as empty.
    source = horror_fobj('null.csv')
    csv_tables = CSVTableSet(source)
    row_set = csv_tables.tables[0]

    guessed = type_guess(row_set.sample, strict=True)
    assert_equal(
        guessed,
        [IntegerType(), StringType(), IntegerType(), StringType()])

    row_set.register_processor(types_processor(guessed))
    rows = list(row_set)

    # The text "null" is treated as non-empty text and 0 as a non-empty
    # integer. One list of expected `empty` flags per row, in row order.
    expected_empty = [
        [False, False, False, False],
        [False, False, False, False],
        [False, False, True, True],
        [False, False, False, False],
        [False, False, False, True],
        [False, False, False, True],
    ]
    for index, flags in enumerate(expected_empty):
        assert [cell.empty for cell in rows[index]] == flags

    # Empty integer cells are expected to hold None, empty strings "".
    third_row = rows[2]
    assert [cell.value for cell in third_row] == [3, "null", None, ""], third_row
def test_strict_guessing_handles_padding(self):
    # Strict type guessing on a small in-memory CSV whose cells are padded
    # with spaces. Three columns must be found; the first one contains
    # 'foo', so it is expected to be a string column, the blank middle
    # column a string column, and the last one a decimal column.
    #
    # The fixture is a bytes buffer (io.BytesIO), matching
    # test_non_strict_guessing_handles_padding; the StringIO module used
    # previously only exists on Python 2.
    import io

    # One CSV record per line: three rows of three cells each.
    csv_file = io.BytesIO(b'''
        1, , 2
        2, , 1.1
        foo, , 1500''')
    rows = CSVTableSet(csv_file).tables[0]
    guessed_types = type_guess(rows.sample, strict=True)
    assert_equal(len(guessed_types), 3)
    assert_equal(guessed_types,
                 [StringType(), StringType(), DecimalType()])
def test_read_type_guess_simple(self):
    # Default (non-strict) type guessing on 'simple.csv'. Once the guessed
    # types are applied, the first row (the header) must still be all
    # strings, while the third row must carry the guessed types.
    source = horror_fobj('simple.csv')
    csv_tables = CSVTableSet(source)
    row_set = csv_tables.tables[0]

    guessed = type_guess(row_set.sample)
    wanted = [DateType("%Y-%m-%d"), IntegerType(), StringType()]
    assert_equal(guessed, wanted)

    row_set.register_processor(types_processor(guessed))
    rows = list(row_set)

    assert_equal([cell.type for cell in rows[0]], [StringType()] * 3)
    assert_equal(wanted, [cell.type for cell in rows[2]])
def test_type_guess_strict(self):
    # Strict type guessing, under the en_GB.UTF-8 locale, over an in-memory
    # CSV that mixes integers, decimals (including quoted values with
    # thousands separators), dates in two notations, "NaN" and blank cells.
    import io
    import locale

    # setlocale() changes process-wide state. Remember the current setting
    # and restore it afterwards so this test cannot influence later ones.
    previous_locale = locale.setlocale(locale.LC_ALL)
    locale.setlocale(locale.LC_ALL, 'en_GB.UTF-8')
    try:
        # One CSV record per line; the last record consists of empty cells
        # only. A bytes buffer is used, matching
        # test_non_strict_guessing_handles_padding (the StringIO module
        # only exists on Python 2).
        csv_file = io.BytesIO(b'''
            1, 2012/2/12, 2, 2,02 October 2011,"100.234354"
            2, 2012/2/12, 1.1, 0,1 May 2011,"100,000,000.12"
            foo, bar, 1500, 0,,"NaN"
            4, 2012/2/12, 42,"-2,000",24 October 2012,"42"
            ,,,,,''')
        rows = CSVTableSet(csv_file).tables[0]
        guessed_types = type_guess(rows.sample, strict=True)
        assert_equal(guessed_types, [
            StringType(), StringType(),
            DecimalType(), IntegerType(), DateType('%d %B %Y'),
            DecimalType()])
    finally:
        locale.setlocale(locale.LC_ALL, previous_locale)
def test_read_type_know_simple(self):
    # No type guessing is applied here: the cell types are taken as they
    # come out of reading 'simple.xls', and the second sampled row is
    # expected to be date / float / string.
    source = horror_fobj('simple.xls')
    xls_tables = XLSTableSet(source)
    row_set = xls_tables.tables[0]

    sampled_rows = list(row_set.sample)
    second_row = sampled_rows[1]
    assert_equal(
        [cell.type for cell in second_row],
        [DateType(None), FloatType(), StringType()])
def test_read_simple_tsv(self):
    # Read the 'example.tsv' fixture through CSVTableSet and check the row
    # count, the first two header values, and that every row has 17 cells
    # with a string-typed first cell.
    source = horror_fobj('example.tsv')
    csv_tables = CSVTableSet(source)
    row_set = csv_tables.tables[0]
    assert_equal(141, len(list(row_set)))

    first_sampled = list(row_set.sample)[0]
    assert_equal(first_sampled[0].value, 'hour')
    assert_equal(first_sampled[1].value, 'expr1_0_imp')

    every_row = list(row_set)
    for cells in every_row:
        assert_equal(17, len(cells))
        assert_equal(cells[0].type, StringType())
def test_non_strict_guessing_handles_padding(self):
    # Non-strict type guessing on a small in-memory CSV whose cells are
    # padded with spaces. Three columns must be found. The first column
    # holds two integers and one 'foo' and is still expected to be guessed
    # as an integer column in non-strict mode.
    #
    # One CSV record per line: three rows of three cells each.
    csv_file = io.BytesIO(b'''
        1, , 2.1
        2, , 1.1
        foo, , 1500''')
    rows = CSVTableSet(csv_file).tables[0]
    guessed_types = type_guess(rows.sample, strict=False)
    assert_equal(len(guessed_types), 3)
    assert_equal(guessed_types,
                 [IntegerType(), StringType(), DecimalType()])
def test_read_simple_csv(self):
    # Read the 'simple.csv' fixture and check the row count, the first two
    # header values, and that every row has 3 cells with a string-typed
    # first cell (no type processor is registered here).
    source = horror_fobj('simple.csv')
    csv_tables = CSVTableSet(source)
    row_set = csv_tables.tables[0]
    assert_equal(7, len(list(row_set)))

    first_sampled = list(row_set.sample)[0]
    assert_equal(first_sampled[0].value, 'date')
    assert_equal(first_sampled[1].value, 'temperature')

    every_row = list(row_set)
    for cells in every_row:
        assert_equal(3, len(cells))
        assert_equal(cells[0].type, StringType())
def test_read_simple_psv(self):
    # Pipe / vertical bar ("|") separated values: read 'simple.psv' through
    # CSVTableSet and check the row count, the first two header values, and
    # that every row has 6 cells with a string-typed first cell.
    source = horror_fobj('simple.psv')
    csv_tables = CSVTableSet(source)
    row_set = csv_tables.tables[0]
    assert_equal(29, len(list(row_set)))

    first_sampled = list(row_set.sample)[0]
    assert_equal(first_sampled[0].value, 'Year')
    assert_equal(first_sampled[1].value, 'New dwellings')

    every_row = list(row_set)
    for cells in every_row:
        assert_equal(6, len(cells))
        assert_equal(cells[0].type, StringType())
def test_read_complex_csv(self):
    # Read the 'complex.csv' fixture and check the row count, all four
    # header values, and that every row has 4 cells with a string-typed
    # first cell.
    source = horror_fobj('complex.csv')
    csv_tables = CSVTableSet(source)
    row_set = csv_tables.tables[0]
    assert_equal(4, len(list(row_set)))

    first_sampled = list(row_set.sample)[0]
    wanted_headers = ['date', 'another date', 'temperature', 'place']
    for position, title in enumerate(wanted_headers):
        assert_equal(first_sampled[position].value, title)

    every_row = list(row_set)
    for cells in every_row:
        assert_equal(4, len(cells))
        assert_equal(cells[0].type, StringType())
def test_strict_type_guessing_with_large_file(self):
    # Strict type guessing on the '211.csv' fixture, restricted to four
    # candidate types, after skipping everything up to and including the
    # guessed header row. 96 columns are expected.
    source = horror_fobj('211.csv')
    rows = CSVTableSet(source).tables[0]

    offset, _headers = headers_guess(rows.sample)
    rows.register_processor(offset_processor(offset + 1))

    candidate_types = [StringType, IntegerType, DecimalType, DateUtilType]
    guessed_types = type_guess(rows.sample, candidate_types, True)
    assert_equal(len(guessed_types), 96)

    # Expected types, written as runs. Most columns are strings; integers
    # sit at positions 0, 6, 27, 54 and 70, decimals at 29-30 and dates at
    # 89-92 (zero-based).
    expected = (
        [IntegerType()] + [StringType()] * 5
        + [IntegerType()] + [StringType()] * 20
        + [IntegerType(), StringType(), DecimalType(), DecimalType()]
        + [StringType()] * 23
        + [IntegerType()] + [StringType()] * 15
        + [IntegerType()] + [StringType()] * 18
        + [DateUtilType()] * 4
        + [StringType()] * 3
    )
    assert_equal(guessed_types, expected)
def test_file_with_few_strings_among_integers(self):
    # Strict type guessing on the 'mixedGLB.csv' fixture, restricted to
    # four candidate types, after skipping everything up to and including
    # the guessed header row. 19 columns are expected.
    fh = horror_fobj('mixedGLB.csv')
    rows = CSVTableSet(fh).tables[0]
    offset, headers = headers_guess(rows.sample)
    rows.register_processor(offset_processor(offset + 1))
    types = [StringType, IntegerType, DecimalType, DateUtilType]
    guessed_types = type_guess(rows.sample, types, True)
    assert_equal(len(guessed_types), 19)
    # Debugging aid, shown when the assertion below fails. Written in call
    # form because the bare `print x` statement is a SyntaxError on
    # Python 3; with a single argument the output is the same on Python 2.
    print(guessed_types)
    assert_equal(guessed_types, [
        IntegerType(), IntegerType(),
        IntegerType(), IntegerType(), IntegerType(), IntegerType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), IntegerType(), StringType(),
        StringType()])
head_join = "\n".join(head_str) f = io.BytesIO(bytes(head_join, 'utf-8')) # gussing row_set = CSVTableSet(f, delimiter=delimiter).tables[0] offset, headers = headers_guess(row_set.sample) row_set.register_processor(headers_processor(headers)) row_set.register_processor(offset_processor(offset + 1)) types = type_guess(row_set.sample) #types = type_guess(row_set.sample, strict=True) print('guessed types:', types) # constructing ddl cols = [] for indx, typ in enumerate(types): if typ == StringType(): cols.append(" `a%s` string" % (indx)) elif typ == DateType(date_format): cols.append(" `a%s` date" % (indx)) elif typ == DecimalType(): cols.append(" `a%s` double" % (indx)) elif typ == IntegerType(): cols.append(" `a%s` int" % (indx)) elif typ == BoolType(): cols.append(" `a%s` boolean" % (indx)) else: raise Exception("A type of column %indx cannot be handled. %s " % (indx, typ)) cols_str = ",\n".join(cols) ddl = '''----------------------------------------------------------