Example #1
def test_filereader_book1xlsx():
    path = Path(__file__).parent / "data" / 'book1.xlsx'
    assert path.exists()
    start = process_time_ns()
    tables = list(file_reader(path))
    end = process_time_ns()

    fields = sum(len(t) * len(t.columns) for t in tables)
    print("{:,} fields/seccond".format(
        round(1e9 * fields / max(1, end - start), 0)))

    sheet1 = Table(filename=path.name, sheet_name='Sheet1')
    for column_name in list('abcdef'):
        sheet1.add_column(column_name, int, False)

    sheet2 = Table(
        filename=path.name,
        sheet_name='Sheet2 ')  # there's a deliberate white space at the end!
    sheet2.add_column('a', int, False)
    for column_name in list('bcdef'):
        sheet2.add_column(column_name, float, False)

    books = [sheet1, sheet2]

    for book, table in zip(books, tables):
        table.show(slice(5))
        assert table.compare(book)
        assert len(table) == 45, len(table)
Example #2
def test_filereader_gdocs1csv_no_header():
    path = Path(__file__).parent / "data" / 'gdocs1.csv'
    assert path.exists()
    table = list(file_reader(path, has_headers=False))[0]
    table.show(slice(0, 10))

    book1_csv = Table(filename=path.name)
    for idx, _ in enumerate(list('abcdef'), 1):
        book1_csv.add_column(f"_{idx}", str)

    assert table.compare(book1_csv), table.compare(book1_csv)
    assert len(table) == 46
Example #3
def test_filereader_gdocs1xlsx():
    path = Path(__file__).parent / "data" / 'gdocs1.xlsx'
    assert path.exists()
    table = list(file_reader(path))[0]
    table.show(slice(0, 10))

    gdocs1xlsx = Table(filename=path.name, sheet_name='Sheet1')
    for float_type in list('abcdef'):
        gdocs1xlsx.add_column(float_type, int)

    assert table.compare(gdocs1xlsx), table.compare(gdocs1xlsx)
    assert len(table) == 45
Example #4
def test_filereader_gdocs1xlsx_no_header():
    path = Path(__file__).parent / "data" / 'gdocs1.xlsx'
    assert path.exists()
    table = list(file_reader(path, has_headers=False))[0]
    table.show(slice(0, 10))

    gdocs1xlsx = Table(filename=path.name, sheet_name='Sheet1')
    for idx, _ in enumerate(list('abcdef'), 1):
        gdocs1xlsx.add_column(f"_{idx}", str)

    assert table.compare(gdocs1xlsx), table.compare(gdocs1xlsx)
    assert len(table) == 46
Example #5
def test_filereader_exceldatesxlsx():
    path = Path(__file__).parent / "data" / 'excel_dates.xlsx'
    assert path.exists()
    table = list(file_reader(path))[0]
    table.show()

    sheet1 = Table(filename=path.name, sheet_name='Sheet1')
    sheet1.add_column('Date', datetime, False)
    sheet1.add_column('numeric value', int, False)
    sheet1.add_column('string', date, False)
    sheet1.add_column('bool', bool, False)

    assert table.compare(sheet1)
    assert len(table) == 2, len(table)
Example #6
def test_filereader_gdocsc1ods():
    path = Path(__file__).parent / "data" / 'gdocs1.ods'
    assert path.exists()
    tables = file_reader(path)

    sheet1 = Table(filename=path.name, sheet_name='Sheet1')
    for int_type in list('abcdef'):
        sheet1.add_column(int_type, int)

    sheet2 = Table(filename=path.name, sheet_name='Sheet2')
    sheet2.add_column('a', int)
    for float_type in list('bcdef'):
        sheet2.add_column(float_type, float)

    sheets = [sheet1, sheet2]

    for sheet, table in zip(sheets, tables):
        table.compare(sheet)
        assert len(table) == 45, table.show(blanks="")
Example #7
def test_groupby_02():
    t = Table()
    t.add_column('A', int, data=[1, 1, 2, 2, 3, 3] * 2)
    t.add_column('B', int, data=[1, 2, 3, 4, 5, 6] * 2)
    t.add_column('C', int, data=[6, 5, 4, 3, 2, 1] * 2)

    t.show()
    # +=====+=====+=====+
    # |  A  |  B  |  C  |
    # | int | int | int |
    # |False|False|False|
    # +-----+-----+-----+
    # |    1|    1|    6|
    # |    1|    2|    5|
    # |    2|    3|    4|
    # |    2|    4|    3|
    # |    3|    5|    2|
    # |    3|    6|    1|
    # |    1|    1|    6|
    # |    1|    2|    5|
    # |    2|    3|    4|
    # |    2|    4|    3|
    # |    3|    5|    2|
    # |    3|    6|    1|
    # +=====+=====+=====+

    g = t.groupby(keys=['A', 'C'], functions=[('B', gb.sum)])
    g.table.show()
    t2 = g.pivot('A')

    t2.show()
    # +=====+==========+==========+==========+
    # |  C  |Sum(B,A=1)|Sum(B,A=2)|Sum(B,A=3)|
    # | int |   int    |   int    |   int    |
    # |False|   True   |   True   |   True   |
    # +-----+----------+----------+----------+
    # |    5|         4|      None|      None|
    # |    6|         2|      None|      None|
    # |    3|      None|         8|      None|
    # |    4|      None|         6|      None|
    # |    1|      None|      None|        12|
    # |    2|      None|      None|        10|
    # +=====+==========+==========+==========+

    assert len(t2) == 6 and len(t2.columns) == 4
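
# The pivot cells above can be cross-checked with plain Python. For example, Sum(B, A=1) in
# the C=5 row adds up B over every (A=1, C=5) record; a small sanity check, independent of
# the tablite API:
rows = list(zip([1, 1, 2, 2, 3, 3] * 2, [1, 2, 3, 4, 5, 6] * 2, [6, 5, 4, 3, 2, 1] * 2))
assert sum(b for a, b, c in rows if a == 1 and c == 5) == 4   # Sum(B, A=1) at C=5
assert sum(b for a, b, c in rows if a == 3 and c == 1) == 12  # Sum(B, A=3) at C=1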
Example #8
def test_lookup_functions():  # lookups are supported via indexing:
    table6 = Table()
    table6.add_column(
        'A',
        str,
        data=['Alice', 'Bob', 'Bob', 'Ben', 'Charlie', 'Ben', 'Albert'])
    table6.add_column('B',
                      str,
                      data=[
                          'Alison', 'Marley', 'Dylan', 'Affleck', 'Hepburn',
                          'Barnes', 'Einstein'
                      ])

    index = table6.index('A')  # single key.
    assert index[('Bob', )] == {1, 2}

    index2 = table6.index('A', 'B')  # multiple keys.
    assert index2[('Bob', 'Dylan')] == {2}
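
# Conceptually, such an index maps key tuples to sets of row positions. The same structure can
# be built with a plain defaultdict (a sketch for illustration, not the tablite implementation):
from collections import defaultdict

names = ['Alice', 'Bob', 'Bob', 'Ben', 'Charlie', 'Ben', 'Albert']
surnames = ['Alison', 'Marley', 'Dylan', 'Affleck', 'Hepburn', 'Barnes', 'Einstein']
plain_index = defaultdict(set)
for position, key in enumerate(zip(names, surnames)):
    plain_index[key].add(position)
assert plain_index[('Bob', 'Dylan')] == {2}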
Example #9
def test_filereader_utf8sig_encoding_csv():
    path = Path(__file__).parent / "data" / 'utf8sig.csv'
    assert path.exists()
    table = list(file_reader(path, sep=','))[0]
    table.show(slice(0, 10))
    table.show(slice(-15))

    book1_csv = Table(filename=path.name)
    book1_csv.add_column('432', int)
    book1_csv.add_column('1', int)
    assert table.compare(book1_csv), table.compare(book1_csv)
    assert len(table) == 2, len(table)
Example #10
def test_filereader_gdocsc1ods_no_header():
    path = Path(__file__).parent / "data" / 'gdocs1.ods'
    assert path.exists()
    tables = file_reader(path, has_headers=False)

    sheet1 = Table(filename=path.name, sheet_name='Sheet1')
    for idx, _ in enumerate(list('abcdef'), 1):
        sheet1.add_column(f"_{idx}", str)

    sheet2 = Table(filename=path.name, sheet_name='Sheet2')
    for idx, _ in enumerate(list('abcdef'), 1):
        sheet2.add_column(f"_{idx}", str)

    sheets = [sheet1, sheet2]

    for sheet, table in zip(sheets, tables):
        table.show(slice(0, 10))
        table.compare(sheet)
        assert len(table) == 46, table.show(blanks="")
Example #11
def test_filereader_book1tsv():
    path = Path(__file__).parent / "data" / 'book1.tsv'
    assert path.exists()
    table = list(file_reader(path))[0]
    table.show(slice(0, 10))

    book1 = Table(filename=path.name)
    book1.add_column('a', int)
    for float_type in list('bcdef'):
        book1.add_column(float_type, float)

    assert table.compare(book1), table.compare(book1)
    assert len(table) == 45
Example #12
def test_ttopi():
    """ example code from the readme as "reversing a pivot tablite". """
    from random import seed, choice
    seed(11)

    records = 9
    t = Table()
    t.add_column('record id',
                 int,
                 allow_empty=False,
                 data=[i for i in range(records)])
    for column in [f"4.{i}.a" for i in range(5)]:
        t.add_column(
            column,
            str,
            allow_empty=True,
            data=[choice(['a', 'h', 'e', None]) for i in range(records)])

    print("\nshowing raw data:")
    t.show()
    # +=====+=====+
    # |  A  |  B  |
    # | str | str |
    # |False|False|
    # +-----+-----+
    # |4.2.a|e    |
    # |4.3.a|h    |
    # |4.2.a|h    |
    # |4.2.a|e    |
    # |4.3.a|e    |
    # |4.3.a|e    |
    # |4.1.a|e    |
    # |4.1.a|a    |
    # |4.3.a|e    |
    # |4.2.a|a    |
    # |4.3.a|e    |
    # |4.3.a|a    |
    # |4.1.a|a    |
    # |4.1.a|a    |
    # |4.2.a|a    |
    # |4.2.a|a    |
    # |4.1.a|e    |
    # |4.1.a|a    |
    # |4.3.a|h    |
    # |4.3.a|h    |
    # |4.3.a|h    |
    # |4.1.a|e    |
    # +=====+=====+

    # wanted output:
    # +=====+==============+==============+==============+
    # |  A  | Count(A,B=a) | Count(A,B=h) | Count(A,B=e) |
    # | str |     int      |     int      |     int      |
    # |False|    False     |    False     |    False     |
    # +-----+--------------+--------------+--------------+
    # |4.1.a|            3 |            0 |            3 |
    # |4.2.a|            3 |            1 |            2 |
    # |4.3.a|            1 |            4 |            4 |
    # +=====+==============+==============+==============+

    reverse_pivot = Table()
    records = t['record id']
    reverse_pivot.add_column('record id', records.datatype, allow_empty=False)
    reverse_pivot.add_column('4.x', str, allow_empty=False)
    reverse_pivot.add_column('ahe', str, allow_empty=True)

    for name in t.columns:
        if not name.startswith('4.'):
            continue
        column = t[name]
        for index, entry in enumerate(column):
            new_row = records[index], name, entry  # record id, 4.x, ahe
            reverse_pivot.add_row(new_row)

    print("\nshowing reversed pivot of the raw data:")
    reverse_pivot.show()

    g = reverse_pivot.groupby(['4.x', 'ahe'],
                              functions=[('ahe', GroupBy.count)])
    print("\nshowing basic groupby of the reversed pivot")
    g.table.show()
    t2 = g.pivot('ahe')
    print("\nshowing the wanted output:")
    t2.show()
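
# The loop above is an "unpivot" (melt): each (record id, column name, value) triple becomes one
# row. A minimal plain-Python sketch of the same idea, using made-up data rather than the seeded
# random values from the test:
wide = {'record id': [0, 1], '4.0.a': ['a', 'h'], '4.1.a': [None, 'e']}
long_rows = [(rid, name, value)
             for name in wide if name.startswith('4.')
             for rid, value in zip(wide['record id'], wide[name])]
assert long_rows == [(0, '4.0.a', 'a'), (1, '4.0.a', 'h'),
                     (0, '4.1.a', None), (1, '4.1.a', 'e')]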
Example #13
def test_lookup():
    friends = Table()
    friends.add_column(
        "name",
        str,
        data=['Alice', 'Betty', 'Charlie', 'Dorethy', 'Edward', 'Fred'])
    friends.add_column("stop",
                       str,
                       data=[
                           'Downtown-1', 'Downtown-2', 'Hillside View',
                           'Hillside Crescent', 'Downtown-2', 'Chicago'
                       ])
    friends.show()

    random.seed(11)
    table_size = 40

    times = [
        DataTypes.time(random.randint(21, 23), random.randint(0, 59))
        for i in range(table_size)
    ]
    stops = [
        'Stadium', 'Hillside', 'Hillside View', 'Hillside Crescent',
        'Downtown-1', 'Downtown-2', 'Central station'
    ] * 2 + [f'Random Road-{i}' for i in range(table_size)]
    route = [random.choice([1, 2, 3]) for i in stops]

    bustable = Table()
    bustable.add_column("time", DataTypes.time, data=times)
    bustable.add_column("stop", str, data=stops[:table_size])
    bustable.add_column("route", int, data=route[:table_size])

    bustable.sort(**{'time': False})

    print("Departures from Concert Hall towards ...")
    bustable[:10].show()

    lookup_1 = friends.lookup(bustable, ('time', ">=", DataTypes.time(21, 10)),
                              ('stop', "==", 'stop'))
    lookup_1.sort(**{'time': False})
    lookup_1.show()

    expected = [
        ('Fred', 'Chicago', None, None, None),
        ('Dorethy', 'Hillside Crescent', time(21, 5), 'Hillside Crescent', 2),
        ('Betty', 'Downtown-2', time(21, 51), 'Downtown-2', 1),
        ('Edward', 'Downtown-2', time(21, 51), 'Downtown-2', 1),
        ('Charlie', 'Hillside View', time(22, 19), 'Hillside View', 2),
        ('Alice', 'Downtown-1', time(23, 12), 'Downtown-1', 3),
    ]

    for row in lookup_1.rows:
        expected.remove(row)
    assert expected == []
Example #14
def test_clipboard():
    t = Table()
    t.add_column('dog', int, data=[1, 4])
    t.add_column('cat', int, data=[2, 5])
    t.add_column('hat', int, data=[3, 6])
    t.show()
    try:
        t.copy_to_clipboard()
        t2 = t.copy_from_clipboard()
        assert t == t2
    except pyperclip.PyperclipException:
        pass  # Travis VMs can't handle this.
Example #15
def test_sortation():  # Sortation

    table7 = Table()
    table7.add_column('A',
                      int,
                      data=[1, None, 8, 3, 4, 6, 5, 7, 9],
                      allow_empty=True)
    table7.add_column('B', int, data=[10, 100, 1, 1, 1, 1, 10, 10, 10])
    table7.add_column('C', int, data=[0, 1, 0, 1, 0, 1, 0, 1, 0])

    assert not table7.is_sorted()

    sort_order = {'B': False, 'C': False, 'A': False}

    table7.sort(**sort_order)

    assert list(table7.rows) == [(4, 1, 0), (8, 1, 0), (3, 1, 1), (6, 1, 1),
                                 (1, 10, 0), (5, 10, 0), (9, 10, 0),
                                 (7, 10, 1), (None, 100, 1)]

    assert list(table7.filter('A', 'B', slice(4, 8))) == [(1, 10), (5, 10),
                                                          (9, 10), (7, 10)]

    assert table7.is_sorted(**sort_order)
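
# The same ordering can be reproduced with the standard library, assuming that False in the
# sort_order dict means "not reversed" (ascending) and that None sorts last, as the expected
# rows above suggest:
rows = list(zip([1, None, 8, 3, 4, 6, 5, 7, 9],
                [10, 100, 1, 1, 1, 1, 10, 10, 10],
                [0, 1, 0, 1, 0, 1, 0, 1, 0]))
key = lambda r: (r[1], r[2], r[0] is None, r[0] if r[0] is not None else 0)
assert sorted(rows, key=key) == [(4, 1, 0), (8, 1, 0), (3, 1, 1), (6, 1, 1),
                                 (1, 10, 0), (5, 10, 0), (9, 10, 0),
                                 (7, 10, 1), (None, 100, 1)]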
Example #16
def test_sql_joins():  # a couple of examples with SQL join:

    numbers = Table()
    numbers.add_column('number',
                       int,
                       allow_empty=True,
                       data=[1, 2, 3, 4, None])
    numbers.add_column('colour',
                       str,
                       data=['black', 'blue', 'white', 'white', 'blue'])

    letters = Table()
    letters.add_column('letter',
                       str,
                       allow_empty=True,
                       data=['a', 'b', 'c', 'd', None])
    letters.add_column('color',
                       str,
                       data=['blue', 'white', 'orange', 'white', 'blue'])

    # left join
    # SELECT number, letter FROM numbers LEFT JOIN letters ON numbers.colour == letters.color
    left_join = numbers.left_join(letters,
                                  left_keys=['colour'],
                                  right_keys=['color'],
                                  columns=['number', 'letter'])
    left_join.show()
    # +======+======+
    # |number|letter|
    # | int  | str  |
    # | True | True |
    # +------+------+
    # |     1|None  |
    # |     2|a     |
    # |     2|None  |
    # |     3|b     |
    # |     3|d     |
    # |     4|b     |
    # |     4|d     |
    # |None  |a     |
    # |None  |None  |
    # +======+======+
    assert [i
            for i in left_join['number']] == [1, 2, 2, 3, 3, 4, 4, None, None]
    assert [i for i in left_join['letter']
            ] == [None, 'a', None, 'b', 'd', 'b', 'd', 'a', None]

    # inner join
    # SELECT number, letter FROM numbers JOIN letters ON numbers.colour == letters.color
    inner_join = numbers.inner_join(letters,
                                    left_keys=['colour'],
                                    right_keys=['color'],
                                    columns=['number', 'letter'])
    inner_join.show()
    # +======+======+
    # |number|letter|
    # | int  | str  |
    # | True | True |
    # +------+------+
    # |     2|a     |
    # |     2|None  |
    # |None  |a     |
    # |None  |None  |
    # |     3|b     |
    # |     3|d     |
    # |     4|b     |
    # |     4|d     |
    # +======+======+
    assert [i for i in inner_join['number']] == [2, 2, None, None, 3, 3, 4, 4]
    assert [i for i in inner_join['letter']
            ] == ['a', None, 'a', None, 'b', 'd', 'b', 'd']

    # outer join
    # SELECT number, letter FROM numbers OUTER JOIN letters ON numbers.colour == letters.color
    outer_join = numbers.outer_join(letters,
                                    left_keys=['colour'],
                                    right_keys=['color'],
                                    columns=['number', 'letter'])
    outer_join.show()
    # +======+======+
    # |number|letter|
    # | int  | str  |
    # | True | True |
    # +------+------+
    # |     1|None  |
    # |     2|a     |
    # |     2|None  |
    # |     3|b     |
    # |     3|d     |
    # |     4|b     |
    # |     4|d     |
    # |None  |a     |
    # |None  |None  |
    # |None  |c     |
    # +======+======+
    assert [i for i in outer_join['number']
            ] == [1, 2, 2, 3, 3, 4, 4, None, None, None]
    assert [i for i in outer_join['letter']
            ] == [None, 'a', None, 'b', 'd', 'b', 'd', 'a', None, 'c']

    assert left_join != inner_join
    assert inner_join != outer_join
    assert left_join != outer_join
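
# The SQL in the comments above can be checked independently with the standard library's sqlite3
# module (a self-contained sketch, not part of the tablite API; SQL does not guarantee row order,
# so the result is compared as a sorted multiset):
import sqlite3

con = sqlite3.connect(':memory:')
con.execute("CREATE TABLE numbers (number INTEGER, colour TEXT)")
con.execute("CREATE TABLE letters (letter TEXT, color TEXT)")
con.executemany("INSERT INTO numbers VALUES (?, ?)",
                [(1, 'black'), (2, 'blue'), (3, 'white'), (4, 'white'), (None, 'blue')])
con.executemany("INSERT INTO letters VALUES (?, ?)",
                [('a', 'blue'), ('b', 'white'), ('c', 'orange'), ('d', 'white'), (None, 'blue')])

sql_left = con.execute("SELECT number, letter FROM numbers "
                       "LEFT JOIN letters ON numbers.colour = letters.color").fetchall()
expected = [(1, None), (2, 'a'), (2, None), (3, 'b'), (3, 'd'),
            (4, 'b'), (4, 'd'), (None, 'a'), (None, None)]
assert sorted(sql_left, key=repr) == sorted(expected, key=repr)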
Example #17
def test_basic_table():
    # creating a table incrementally is straightforward:
    table = Table()
    table.add_column('A', int, False)
    assert 'A' in table

    table.add_column('B', str, allow_empty=False)
    assert 'B' in table

    # appending rows is easy:
    table.add_row((1, 'hello'))
    table.add_row((2, 'world'))

    # converting to and from json is easy:
    table_as_json = table.to_json()
    table2 = Table.from_json(table_as_json)

    zipped = zlib.compress(table_as_json.encode())
    a, b = len(zipped), len(table_as_json)
    print("zipping reduces to", a, "from", b, "bytes, e.g.",
          round(100 * a / b, 0), "% of original")

    # copying is easy:
    table3 = table.copy()

    table.rename_column('A', 'aa')
    assert list(table.columns) == ['aa', 'B']
    table.rename_column('aa', 'A')
    assert list(table.columns) == ['A', 'B']

    # and checking for headers is simple:
    assert 'A' in table
    assert 'Z' not in table

    # comparisons are straightforward:
    assert table == table2 == table3

    # even if you only want to check metadata:
    table.compare(table3)  # will raise exception if they're different.

    # appending is easy, as + also works:
    table3x2 = table3 + table3
    assert len(table3x2) == len(table3) * 2

    # and so does +=
    table3x2 += table3
    assert len(table3x2) == len(table3) * 3

    # type verification is included:
    try:
        table.columns['A'][0] = 'Hallo'
        assert False, "A TypeError should have been raised."
    except TypeError:
        assert True

    # updating values works just like it does on a list:
    assert 'A' in table.columns
    assert isinstance(table.columns['A'], (StoredList, list))
    last_row = -1
    table['A'][last_row] = 44
    table['B'][last_row] = "Hallo"

    assert table != table2

    # if you try to loop and forget the direction, Table will tell you
    try:
        for row in table:  # won't pass
            assert False, "not possible. Use 'for row in table.rows' or 'for column in table.columns'"
    except AttributeError:
        assert True

    _ = [table2.add_row(row) for row in table.rows]

    before = [r for r in table2.rows]
    assert before == [(1, 'hello'), (2, 'world'), (1, 'hello'), (44, 'Hallo')]

    # as is filtering for ALL that match:
    filter_1 = lambda x: 'llo' in x
    filter_2 = lambda x: x > 3

    after = table2.all(**{'B': filter_1, 'A': filter_2})

    assert list(after.rows) == [(44, 'Hallo')]

    # as is filtering for ANY that match (OR-logic):
    after = table2.any(**{'B': filter_1, 'A': filter_2})

    assert list(after.rows) == [(1, 'hello'), (1, 'hello'), (44, 'Hallo')]
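
    # the same AND/OR semantics, cross-checked with plain list comprehensions over `before`
    # (a sanity check only, independent of the tablite API):
    assert [r for r in before if filter_1(r[1]) and filter_2(r[0])] == [(44, 'Hallo')]
    assert [r for r in before if filter_1(r[1]) or filter_2(r[0])] == [
        (1, 'hello'), (1, 'hello'), (44, 'Hallo')]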

    # Imagine a table with columns a, b, c, d, e (all integers) like this:
    t = Table()
    for c in 'abcde':
        t.add_column(header=c,
                     datatype=int,
                     allow_empty=False,
                     data=[i for i in range(5)])

    # we want to add two new columns using the functions:
    def f1(a, b, c):
        return a + b + c + 1

    def f2(b, c, d):
        return b * c * d

    # and we want to compute two new columns 'f' and 'g':
    t.add_column(header='f', datatype=int, allow_empty=False)
    t.add_column(header='g', datatype=int, allow_empty=True)

    # we can now use filter to iterate over the table:
    for row in t.filter('a', 'b', 'c', 'd'):
        a, b, c, d = row

        # ... and add the values to the two new columns
        t['f'].append(f1(a, b, c))
        t['g'].append(f2(b, c, d))

    assert len(t) == 5
    assert list(t.columns) == list('abcdefg')
    t.show()

    # slicing is easy:
    table_chunk = table2[2:4]
    assert isinstance(table_chunk, Table)

    # we will handle duplicate names gracefully.
    table2.add_column('B', int, allow_empty=True)
    assert set(table2.columns) == {'A', 'B', 'B_1'}

    # you can delete a column by key...
    del table2['B_1']
    assert set(table2.columns) == {'A', 'B'}

    # adding a computed column is easy:
    table.add_column('new column',
                     str,
                     allow_empty=False,
                     data=[f"{r}" for r in table.rows])

    # showing part of or the whole table is easy:
    table.show()

    table.show('A', slice(0, 1))

    # updating a column with a function is easy:
    f = lambda x: x * 10
    table['A'] = [f(r) for r in table['A']]

    # using regular indexing will also work.
    for ix, r in enumerate(table['A']):
        table['A'][ix] = r * 10

    # and it will tell you if you're not allowed:
    try:
        f = lambda x: f"'{x} as text'"
        table['A'] = [f(r) for r in table['A']]
        assert False, "The line above must raise a TypeError"
    except TypeError as error:
        print("The error is:", str(error))

    # works with all datatypes:
    now = datetime.now()

    table4 = Table()
    table4.add_column('A', int, allow_empty=False, data=[-1, 1])
    table4.add_column('A', int, allow_empty=True, data=[None, 1])  # None!
    table4.add_column('A', float, False, data=[-1.1, 1.1])
    table4.add_column(
        'A', str, False,
        data=["", "1"])  # an empty string is not None when the dtype is str!
    table4.add_column(
        'A', str, True,
        data=[None, "1"])  # None (unlike "") requires allow_empty=True.
    table4.add_column('A', bool, False, data=[False, True])
    table4.add_column('A', datetime, False, data=[now, now])
    table4.add_column('A', date, False, data=[now.date(), now.date()])
    table4.add_column('A', time, False, data=[now.time(), now.time()])

    table4_json = table4.to_json()
    table5 = Table.from_json(table4_json)

    # .. to json and back.
    assert table4 == table5

    # And finally: I can add metadata:
    table5.metadata['db_mapping'] = {
        'A': 'customers.customer_name',
        'A_2': 'product.sku',
        'A_4': 'locations.sender'
    }

    # which also jsonifies without fuss.
    table5_json = table5.to_json()
    table5_from_json = Table.from_json(table5_json)
    assert table5 == table5_from_json
Example #18
def test_filereader_win1251_encoding_csv():
    path = Path(__file__).parent / "data" / 'win1250_test.csv'
    assert path.exists()
    table = list(file_reader(path, sep=';'))[0]
    table.show(slice(0, 10))
    table.show(slice(-15))

    book1_csv = Table(filename=path.name)
    book1_csv.add_column('Item', int)
    book1_csv.add_column('Materiál', str)
    book1_csv.add_column('Objem', float)
    book1_csv.add_column('Jednotka objemu', str)
    book1_csv.add_column('Free Inv Pcs', int)
    assert table.compare(book1_csv), table.compare(book1_csv)
    assert len(table) == 99, len(table)
Example #19
def test_add_rows():
    """ Example from docstring. """
    t = Table()
    t.add_column('test', int)
    t.add_column('A', int)
    t.add_column('B', int)
    t.add_column('C', int)
    test_number = count(1)

    # The following examples are all valid and append the row (1,2,3) to the table.

    t.add_row(next(test_number), 1, 2, 3)
    t.add_row([next(test_number), 1, 2, 3])
    t.add_row((next(test_number), 1, 2, 3))
    t.add_row(*(next(test_number), 1, 2, 3))
    t.add_row(test=next(test_number), A=1, B=2, C=3)
    t.add_row(**{'test': next(test_number), 'A': 1, 'B': 2, 'C': 3})

    # The following examples each add two rows to the table.

    t.add_row((next(test_number), 1, 2, 3), (next(test_number), 4, 5, 6))
    t.add_row([next(test_number), 1, 2, 3], [next(test_number), 4, 5, 6])
    t.add_row({
        'test': next(test_number),
        'A': 1,
        'B': 2,
        'C': 3
    }, {
        'test': next(test_number),
        'A': 4,
        'B': 5,
        'C': 6
    })  # two (or more) dicts as args.
    t.add_row(*[{
        'test': next(test_number),
        'A': 1,
        'B': 2,
        'C': 3
    }, {
        'test': next(test_number),
        'A': 1,
        'B': 2,
        'C': 3
    }])  # list of dicts.
    t.show()  # expects:
    # +=====+=====+=====+=====+
    # | test|  A  |  B  |  C  |
    # | int | int | int | int |
    # |False|False|False|False|
    # +-----+-----+-----+-----+
    # |    1|    1|    2|    3|
    # |    2|    1|    2|    3|
    # |    3|    1|    2|    3|
    # |    4|    1|    2|    3|
    # |    5|    1|    2|    3|
    # |    6|    1|    2|    3|
    # |    7|    1|    2|    3|
    # |    8|    4|    5|    6|
    # |    9|    1|    2|    3|
    # |   10|    4|    5|    6|
    # |   11|    1|    2|    3|
    # |   12|    4|    5|    6|
    # |   13|    1|    2|    3|
    # |   14|    1|    2|    3|
    # +=====+=====+=====+=====+
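    # the six single-row calls and four two-row calls above consume 14 values from the counter,
    # so its next value is 15: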
    assert next(test_number) == 14 + 1
Example #20
def test_filereader_saptxt():
    path = Path(__file__).parent / "data" / 'sap.txt'
    assert path.exists()
    # test part 1: split using a user-defined sequence.
    header = "    | Delivery |  Item|Pl.GI date|Route |SC|Ship-to   |SOrg.|Delivery quantity|SU| TO Number|Material    |Dest.act.qty.|BUn|Typ|Source Bin|Cty"
    split_sequence = ["|"] * header.count('|')
    table = list(file_reader(path, split_sequence=split_sequence))[0]
    table.show(slice(5))

    sap_sample = Table(filename=path.name)
    sap_sample.add_column('None', str, True)
    sap_sample.add_column('Delivery', int, False)
    sap_sample.add_column('Item', int, False)
    sap_sample.add_column('Pl.GI date', date, False)
    sap_sample.add_column('Route', str, False)
    sap_sample.add_column('SC', str, False)
    sap_sample.add_column('Ship-to', str, False)
    sap_sample.add_column('SOrg.', str, False)
    sap_sample.add_column('Delivery quantity', int, False)
    sap_sample.add_column('SU', str, False)
    sap_sample.add_column('TO Number', int, False)
    sap_sample.add_column('Material', str, False)
    sap_sample.add_column('Dest.act.qty.', int, False)
    sap_sample.add_column('BUn', str, False)
    sap_sample.add_column('Typ', str, False)
    sap_sample.add_column('Source Bin', str, False)
    sap_sample.add_column('Cty|', str, False)

    assert table.compare(sap_sample)
    assert len(table) == 20, len(table)
Example #21
def test_groupby():
    t = Table()
    for c in 'abcde':
        t.add_column(header=c,
                     datatype=int,
                     allow_empty=False,
                     data=[i for i in range(5)])

    # we want to add two new columns using the functions:
    def f1(a, b, c):
        return a + b + c + 1

    def f2(b, c, d):
        return b * c * d

    # and we want to compute two new columns 'f' and 'g':
    t.add_column(header='f', datatype=int, allow_empty=False)
    t.add_column(header='g', datatype=int, allow_empty=True)

    # we can now use filter to iterate over the table:
    for row in t.filter('a', 'b', 'c', 'd'):
        a, b, c, d = row

        # ... and add the values to the two new columns
        t['f'].append(f1(a, b, c))
        t['g'].append(f2(b, c, d))

    assert len(t) == 5
    assert list(t.columns) == list('abcdefg')
    t.show()

    g = GroupBy(keys=['a', 'b'],
                functions=[('f', gb.max), ('f', gb.min), ('f', gb.sum),
                           ('f', gb.first), ('f', gb.last), ('f', gb.count),
                           ('f', gb.count_unique), ('f', gb.avg),
                           ('f', gb.stdev), ('a', gb.stdev), ('f', gb.median),
                           ('f', gb.mode), ('g', gb.median)])
    t2 = t + t
    assert len(t2) == 2 * len(t)
    t2.show()

    g += t2
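    # each row of g is the keys (a, b) followed by the aggregates in the order declared above:
    # max(f), min(f), sum(f), first(f), last(f), count(f), count_unique(f), avg(f),
    # stdev(f), stdev(a), median(f), mode(f), median(g).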

    assert list(g.rows) == [
        (0, 0, 1, 1, 2, 1, 1, 2, 1, 1.0, 0.0, 0.0, 1, 1, 0),
        (1, 1, 4, 4, 8, 4, 4, 2, 1, 4.0, 0.0, 0.0, 4, 4, 1),
        (2, 2, 7, 7, 14, 7, 7, 2, 1, 7.0, 0.0, 0.0, 7, 7, 8),
        (3, 3, 10, 10, 20, 10, 10, 2, 1, 10.0, 0.0, 0.0, 10, 10, 27),
        (4, 4, 13, 13, 26, 13, 13, 2, 1, 13.0, 0.0, 0.0, 13, 13, 64)
    ]

    g.table.show()

    g2 = GroupBy(keys=['a', 'b'], functions=[('f', gb.max), ('f', g.sum)])
    g2 += t + t + t

    g2.table.show()

    pivot_table = g2.pivot('b')

    pivot_table.show()
Example #22
def test_filereader_123csv():
    csv_file = Path(__file__).parent / "data" / "123.csv"

    table7 = Table(filename=csv_file.name)
    table7.metadata['filename'] = '123.csv'
    table7.add_column('A',
                      int,
                      data=[1, None, 8, 3, 4, 6, 5, 7, 9],
                      allow_empty=True)
    table7.add_column('B', int, data=[10, 100, 1, 1, 1, 1, 10, 10, 10])
    table7.add_column('C', int, data=[0, 1, 0, 1, 0, 1, 0, 1, 0])
    sort_order = {'B': False, 'C': False, 'A': False}
    table7.sort(**sort_order)

    headers = ", ".join([c for c in table7.columns])
    data = [headers]
    for row in table7.rows:
        data.append(", ".join(str(v) for v in row))

    s = "\n".join(data)
    print(s)
    csv_file.write_text(s)  # write
    tr_table = list(file_reader(csv_file))[0]  # read
    csv_file.unlink()  # cleanup

    tr_table.show()
    find_format(tr_table)

    assert tr_table == table7
Example #23
def test_recreate_readme_comparison():
    try:
        import os
        import psutil
    except ImportError:
        return
    process = psutil.Process(os.getpid())
    baseline_memory = process.memory_info().rss
    from time import process_time

    from tablite import Table

    digits = 1_000_000

    records = Table()
    records.add_column('method', str)
    records.add_column('memory', int)
    records.add_column('time', float)

    records.add_row(('python', baseline_memory, 0.0))

    # Let's now use the common and convenient "row"-based format:

    start = process_time()
    L = []
    for _ in range(digits):
        L.append(tuple([11 for _ in range(10)]))
    end = process_time()

    # Go and check the task manager's memory usage.
    # At this point we're using ~154.2 MB to store 1 million lists with 10 items each.
    records.add_row(
        ('1e6 lists w. 10 integers',
         process.memory_info().rss - baseline_memory, round(end - start, 4)))

    L.clear()

    # Let's now use a columnar format instead:
    start = process_time()
    L = [[11 for i in range(digits)] for _ in range(10)]
    end = process_time()

    # Go and check the task manager's memory usage.
    # At this point we're using ~98.2 MB to store 10 lists with 1 million items each.
    records.add_row(
        ('10 lists with 1e6 integers',
         process.memory_info().rss - baseline_memory, round(end - start, 4)))
    L.clear()

    # We've thereby saved roughly 50 MB by avoiding the overhead of managing 1 million lists.

    # Q: But why didn't I just use an array? It would have an even lower memory footprint.
    # A: First, arrays can't hold None values, which we get frequently in dirty CSV data.
    # Second, Table needs even less memory.

    # Let's start with an array:

    import array
    start = process_time()
    L = [array.array('i', [11 for _ in range(digits)]) for _ in range(10)]
    end = process_time()
    # Go and check the task manager's memory usage.
    # At this point we're using 60.0 MB to store 10 arrays with 1 million integers each.

    records.add_row(
        ('10 lists with 1e6 integers in arrays',
         process.memory_info().rss - baseline_memory, round(end - start, 4)))
    L.clear()

    # Now let's use Table:

    start = process_time()
    t = Table()
    for i in range(10):
        t.add_column(str(i),
                     int,
                     allow_empty=False,
                     data=[11 for _ in range(digits)])
    end = process_time()

    records.add_row(
        ('Table with 10 columns with 1e6 integers',
         process.memory_info().rss - baseline_memory, round(end - start, 4)))

    # Go and check the task manager's memory usage.
    # At this point we're using 97.5 MB to store 10 columns with 1 million integers each.

    # Next we'll set `use_disk = True` to drop the data to disk:
    start = process_time()
    t.use_disk = True
    end = process_time()
    records.add_row(
        ('Table on disk with 10 columns with 1e6 integers',
         process.memory_info().rss - baseline_memory, round(end - start, 4)))

    # Go and check the task manager's memory usage.
    # At this point we're using 24.5 MB to store 10 columns with 1 million integers each.
    # Only the metadata remains in Python's memory.

    records.show()
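
# A rough way to see the container overhead discussed above, using only the standard library.
# sys.getsizeof counts just the list/tuple structures themselves (not the shared integer
# objects), and exact byte counts vary between CPython builds:
import sys

row_format = [tuple([11] * 10) for _ in range(1000)]  # 1,000 rows with 10 fields each
col_format = [[11] * 1000 for _ in range(10)]         # 10 columns with 1,000 fields each
row_bytes = sum(sys.getsizeof(r) for r in row_format)
col_bytes = sum(sys.getsizeof(c) for c in col_format)
print(f"row-wise: {row_bytes:,} bytes, column-wise: {col_bytes:,} bytes")
assert col_bytes < row_bytes  # many small containers cost more than a few large ones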