示例#1
0
    def test_apply_null_values(self):
        """Check cell emptiness and values for null.csv after typing.

        Column types are guessed strictly from the sample, registered as a
        types processor, and the resulting rows are inspected cell by cell.
        """
        row_set = CSVTableSet(horror_fobj('null.csv')).tables[0]
        guessed = type_guess(row_set.sample, strict=True)
        assert_equal(guessed, [
            IntegerType(),
            StringType(),
            IntegerType(),
            StringType(),
        ])

        row_set.register_processor(types_processor(guessed))
        data = list(row_set)

        # The text "null" counts as non-empty text and 0 as a non-empty
        # integer, so only genuinely blank cells are flagged as empty.
        expected_empty = [
            [False, False, False, False],
            [False, False, False, False],
            [False, False, True, True],
            [False, False, False, False],
            [False, False, False, True],
            [False, False, False, True],
        ]
        for row_index, flags in enumerate(expected_empty):
            assert [cell.empty for cell in data[row_index]] == flags

        # Blank integer cells are expected to be None, blank strings "".
        assert [cell.value for cell in data[2]] == [3, "null", None, ""], data[2]
示例#2
0
 def test_strict_guessing_handles_padding(self):
     """Strict type guessing must cope with whitespace-padded cells.

     The first column mixes integers with the text "foo", the second
     holds only padding, and the third mixes integers with a decimal.
     """
     # A bytes buffer from ``io`` works on both Python 2 and 3, whereas
     # the ``StringIO`` module only exists on Python 2.  The import is
     # local so the test does not rely on a module-level ``import io``.
     import io
     csv_file = io.BytesIO(b'''
         1,   , 2
         2,   , 1.1
         foo, , 1500''')
     rows = CSVTableSet(csv_file).tables[0]
     guessed_types = type_guess(rows.sample, strict=True)
     assert_equal(len(guessed_types), 3)
     assert_equal(guessed_types, [StringType(), StringType(), DecimalType()])
示例#3
0
    def test_read_type_guess_simple(self):
        """Guess the column types of simple.csv and apply them to its rows."""
        row_set = CSVTableSet(horror_fobj('simple.csv')).tables[0]
        guessed = type_guess(row_set.sample)
        expected_types = [DateType("%Y-%m-%d"), IntegerType(), StringType()]
        assert_equal(guessed, expected_types)

        row_set.register_processor(types_processor(guessed))
        typed_rows = list(row_set)

        # The first row (presumably the header line) stays all strings.
        first_row_types = [cell.type for cell in typed_rows[0]]
        assert_equal(first_row_types, [StringType()] * 3)

        # The third row carries the guessed types.
        third_row_types = [cell.type for cell in typed_rows[2]]
        assert_equal(expected_types, third_row_types)
示例#4
0
 def test_type_guess_strict(self):
     """Strictly guess types for mixed columns under the en_GB.UTF-8 locale.

     The sample mixes integers, decimals with thousands separators,
     textual dates, blanks and plain text across six columns.
     """
     # ``io.BytesIO`` works on Python 2 and 3; the ``StringIO`` module is
     # Python 2 only.  Imports are local, as in the original test.
     import io
     import locale
     csv_file = io.BytesIO(b'''
         1,   2012/2/12, 2,      2,02 October 2011,"100.234354"
         2,   2012/2/12, 1.1,    0,1 May 2011,"100,000,000.12"
         foo, bar,       1500,   0,,"NaN"
         4,   2012/2/12, 42,"-2,000",24 October 2012,"42"
         ,,,,,''')
     # setlocale() changes process-wide state, so remember the current
     # setting and restore it afterwards to avoid leaking into other tests.
     previous_locale = locale.setlocale(locale.LC_ALL)
     locale.setlocale(locale.LC_ALL, 'en_GB.UTF-8')
     try:
         rows = CSVTableSet(csv_file).tables[0]
         guessed_types = type_guess(rows.sample, strict=True)
     finally:
         locale.setlocale(locale.LC_ALL, previous_locale)
     assert_equal(guessed_types, [
         StringType(), StringType(),
         DecimalType(), IntegerType(), DateType('%d %B %Y'),
         DecimalType()])
示例#5
0
 def test_read_type_know_simple(self):
     """The second sampled row of simple.xls reports its cell types."""
     row_set = XLSTableSet(horror_fobj('simple.xls')).tables[0]
     sampled_rows = list(row_set.sample)
     cell_types = [cell.type for cell in sampled_rows[1]]
     assert_equal(cell_types, [DateType(None), FloatType(), StringType()])
示例#6
0
 def test_read_simple_tsv(self):
     """Read example.tsv: 141 rows of 17 cells, first cell typed as string."""
     row_set = CSVTableSet(horror_fobj('example.tsv')).tables[0]
     assert_equal(141, len(list(row_set)))

     # The first sampled row holds the column names.
     first_row = list(row_set.sample)[0]
     for position, name in enumerate(('hour', 'expr1_0_imp')):
         assert_equal(first_row[position].value, name)

     for record in list(row_set):
         assert_equal(17, len(record))
         assert_equal(record[0].type, StringType())
示例#7
0
 def test_non_strict_guessing_handles_padding(self):
     """Non-strict type guessing must cope with whitespace-padded cells.

     With ``strict=False`` the first column is expected to be guessed as
     integer even though one of its three values is the text "foo".
     """
     padded_csv = io.BytesIO(b'''
         1,   , 2.1
         2,   , 1.1
         foo, , 1500''')
     table = CSVTableSet(padded_csv).tables[0]
     guessed_types = type_guess(table.sample, strict=False)
     assert_equal(len(guessed_types), 3)
     expected = [IntegerType(), StringType(), DecimalType()]
     assert_equal(guessed_types, expected)
示例#8
0
    def test_read_simple_csv(self):
        """Read simple.csv: 7 rows of 3 cells, first cell typed as string."""
        row_set = CSVTableSet(horror_fobj('simple.csv')).tables[0]
        assert_equal(7, len(list(row_set)))

        # The first sampled row holds the column names.
        first_row = list(row_set.sample)[0]
        for position, name in enumerate(('date', 'temperature')):
            assert_equal(first_row[position].value, name)

        for record in list(row_set):
            assert_equal(3, len(record))
            assert_equal(record[0].type, StringType())
示例#9
0
    def test_read_simple_psv(self):
        """Read simple.psv, a pipe ("|") separated file: 29 rows of 6 cells."""
        row_set = CSVTableSet(horror_fobj('simple.psv')).tables[0]
        assert_equal(29, len(list(row_set)))

        # The first sampled row holds the column names.
        first_row = list(row_set.sample)[0]
        for position, name in enumerate(('Year', 'New dwellings')):
            assert_equal(first_row[position].value, name)

        for record in list(row_set):
            assert_equal(6, len(record))
            assert_equal(record[0].type, StringType())
示例#10
0
    def test_read_complex_csv(self):
        """Read complex.csv: 4 rows of 4 cells, first cell typed as string."""
        row_set = CSVTableSet(horror_fobj('complex.csv')).tables[0]
        assert_equal(4, len(list(row_set)))

        # The first sampled row holds the column names.
        first_row = list(row_set.sample)[0]
        column_names = ('date', 'another date', 'temperature', 'place')
        for position, name in enumerate(column_names):
            assert_equal(first_row[position].value, name)

        for record in list(row_set):
            assert_equal(4, len(record))
            assert_equal(record[0].type, StringType())
示例#11
0
 def test_strict_type_guessing_with_large_file(self):
     """Strictly guess the types of all 96 columns of 211.csv.

     Rows up to and including the guessed header offset are skipped before
     guessing, and only the four candidate types listed below are tried.
     """
     table = CSVTableSet(horror_fobj('211.csv')).tables[0]
     offset, _headers = headers_guess(table.sample)
     table.register_processor(offset_processor(offset + 1))
     candidates = [StringType, IntegerType, DecimalType, DateUtilType]
     guessed_types = type_guess(table.sample, candidates, True)
     assert_equal(len(guessed_types), 96)

     # Most columns are strings; list only the positions that are not.
     non_string_columns = {
         0: IntegerType, 6: IntegerType, 27: IntegerType,
         29: DecimalType, 30: DecimalType,
         54: IntegerType, 70: IntegerType,
         89: DateUtilType, 90: DateUtilType,
         91: DateUtilType, 92: DateUtilType,
     }
     expected = [
         non_string_columns.get(position, StringType)()
         for position in range(96)
     ]
     assert_equal(guessed_types, expected)
示例#12
0
 def test_file_with_few_strings_among_integers(self):
     """Strictly guess the types of the 19 columns of mixedGLB.csv.

     Rows up to and including the guessed header offset are skipped before
     guessing, and only the four candidate types listed below are tried.
     """
     fh = horror_fobj('mixedGLB.csv')
     rows = CSVTableSet(fh).tables[0]
     offset, headers = headers_guess(rows.sample)
     rows.register_processor(offset_processor(offset + 1))
     types = [StringType, IntegerType, DecimalType, DateUtilType]
     guessed_types = type_guess(rows.sample, types, True)
     assert_equal(len(guessed_types), 19)
     # Diagnostic output; written as a call so the file also parses on
     # Python 3 (the bare ``print x`` statement is Python 2 only).
     print(guessed_types)
     assert_equal(guessed_types, [
         IntegerType(), IntegerType(),
         IntegerType(), IntegerType(), IntegerType(), IntegerType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), IntegerType(), StringType(),
         StringType()])
# Wrap the sampled lines in an in-memory UTF-8 byte stream for CSVTableSet.
head_join = "\n".join(head_str)
f = io.BytesIO(bytes(head_join, 'utf-8'))

# guessing: detect the header row, then the column types
row_set = CSVTableSet(f, delimiter=delimiter).tables[0]
offset, headers = headers_guess(row_set.sample)
row_set.register_processor(headers_processor(headers))
# offset + 1 presumably starts reading just after the header row
row_set.register_processor(offset_processor(offset + 1))
types = type_guess(row_set.sample)
# Alternative: types = type_guess(row_set.sample, strict=True)
print('guessed types:', types)

# constructing ddl: one column definition per guessed type; columns are
# named a0, a1, ... after their position
cols = []
for indx, typ in enumerate(types):
    if typ == StringType():
        cols.append("  `a%s` string" % (indx))
    elif typ == DateType(date_format):
        cols.append("  `a%s` date" % (indx))
    elif typ == DecimalType():
        cols.append("  `a%s` double" % (indx))
    elif typ == IntegerType():
        cols.append("  `a%s` int" % (indx))
    elif typ == BoolType():
        cols.append("  `a%s` boolean" % (indx))
    else:
        # "%s" is required here: the previous "%indx" was parsed as "%i"
        # followed by the literal text "ndx" (e.g. "column 3ndx").
        raise Exception("A type of column %s cannot be handled. %s " %
                        (indx, typ))
cols_str = ",\n".join(cols)

ddl = '''----------------------------------------------------------