示例#1
0
    def test_lhs_cross_functional_union(self):
        """Test for functional_cross_union."""
        table_a = import_csv(self._get_table_a())
        table_b = import_csv(self._get_table_b())

        self.assertTrue(is_functional(table_a))
        self.assertTrue(is_functional(table_b))

        # Calculate left join.
        result = lhs_cross_functional_union(table_a, table_b)

        # Test result set properties
        self.assertEqual(result.cached_functional, CacheStatus.IS)
        self.assertFalse(result.is_empty)
        self.assertEqual(result.cardinality, 8)
        expected = import_csv(self._get_result_cross_functional_union())
        self.assertEqual(result, expected)

        import algebraixlib.algebras.sets as sets
        table_aa = sets.union(table_a, Set(Set(Couplet('PK', '-1'), Couplet('PK', '-2'))))
        self.assertFalse(is_functional(table_aa))
        result = lhs_cross_functional_union(table_aa, table_b)
        self.assertNotEqual(result.cached_functional, CacheStatus.IS)

        table_bb = sets.union(table_b, Set(Set(Couplet('PK', '-1'), Couplet('PK', '-2'))))
        self.assertFalse(is_functional(table_bb))
        result = lhs_cross_functional_union(table_a, table_bb)
        self.assertEqual(result.cached_functional, CacheStatus.IS)
示例#2
0
def get_customers_nations_projected(nations):
    """Execute the equivalent of the following SQL query, querying the CSV file customer.csv:
        SELECT
            custkey, nationkey, nationname
        FROM
            customer
        JOIN
            nations
        ON
            customer.nationkey = nations.nationkey
    """
    timer = FunctionTimer()
    short_prints = True

    customer_types = {'custkey': int, 'nationkey': int, 'acctbal': float}
    customers = csv.import_csv('customer.csv', customer_types)
    timer.lap('customers', short=short_prints)

    customers_nations = clans.cross_functional_union(customers, nations)
    timer.lap('customers_nations', short=short_prints)

    customers_nations_projected = clans.project(customers_nations, 'custkey',
                                                'nationkey', 'nationname')
    timer.end('customers_nations_projected', short=short_prints)

    return customers_nations_projected
示例#3
0
def get_orders_restricted_projected(startdate, enddate):
    """Execute the equivalent of the following SQL query, querying the CSV file orders.csv:
        SELECT
            orderkey, custkey
        FROM
            orders
        WHERE
            startdate <= orders.orderdate and orders.orderdate < enddate

    :param startdate: The lower boundary (inclusive) of the date range for the column 'orderdate'.
    :param enddate: The upper boundary (exclusive) of the date range for the column 'orderdate'.
    """
    timer = FunctionTimer()
    short_prints = True

    def read_date(date_str: str) -> datetime:
        return datetime.strptime(date_str, '%Y-%m-%d').date()

    orders_types = {
        'orderkey': int,
        'custkey': int,
        'orderdate': read_date,
        'totalprice': float,
        'shippriority': int
    }
    orders = csv.import_csv('orders.csv', orders_types)
    timer.lap('orders', short=short_prints)

    def select_dates(rel) -> bool:
        orderdate = rel('orderdate').value
        return (startdate <= orderdate) and (orderdate < enddate)

    orders_restricted = sets.restrict(orders, select_dates)
    timer.lap('orders_restricted', short=short_prints)

    orders_restricted_projected = clans.project(orders_restricted, 'orderkey',
                                                'custkey')
    timer.end('orders_restricted_projected', short=short_prints)

    return orders_restricted_projected
示例#4
0
    def test_csv(self):
        """Test loading clan from csv."""
        clan = Set({Set({Couplet('a', '1'), Couplet('b', '2')})})
        st1 = import_csv(IoCsvTests.path('set1.csv'))
        self.assertEqual(clan, st1)
        self.assertEqual(st1.cached_clan, CacheStatus.IS)
        self.assertEqual(st1.cached_functional, CacheStatus.IS)
        self.assertEqual(st1.cached_regular, CacheStatus.IS)

        clan = Set({Set({Couplet('a', '1'), Couplet('b', '2')})})
        st1a = import_csv(IoCsvTests.path('set1a.csv'))
        # NOTE: duplicate row is removed
        self.assertEqual(clan, st1a)
        self.assertEqual(st1a.cached_clan, CacheStatus.IS)
        self.assertEqual(st1a.cached_functional, CacheStatus.IS)
        self.assertEqual(st1a.cached_regular, CacheStatus.IS)

        clan = Set(
            Set({Couplet('a', '1'),
                 Couplet('b', '2'),
                 Couplet('row', 0)}),
            Set({Couplet('a', '1'),
                 Couplet('b', '2'),
                 Couplet('row', 1)}))
        st1a = import_csv(IoCsvTests.path('set1a.csv'), index_column='row')
        # NOTE: duplicate row is NOT removed
        self.assertEqual(clan, st1a)
        self.assertEqual(st1a.cached_clan, CacheStatus.IS)
        self.assertEqual(st1a.cached_functional, CacheStatus.IS)
        self.assertEqual(st1a.cached_regular, CacheStatus.IS)

        expected = Multiset({Set(Couplet('a', '1'), Couplet('b', '2')): 2})
        actual = import_csv(IoCsvTests.path('set1a.csv'), has_dup_rows=True)
        # NOTE: duplicate row is not removed
        self.assertEqual(actual, expected)

        self.assertEqual(actual.cached_multiclan, CacheStatus.IS)
        self.assertEqual(actual.cached_functional, CacheStatus.IS)
        self.assertEqual(actual.cached_regular, CacheStatus.IS)

        clan = Set({
            Set({Couplet('a', '1'), Couplet('b', '2')}),
            Set({Couplet('a', '3'), Couplet('b', '4')})
        })
        st2 = import_csv(IoCsvTests.path('set2.csv'))
        self.assertEqual(clan, st2)
        self.assertEqual(st2.cached_clan, CacheStatus.IS)
        self.assertEqual(st2.cached_functional, CacheStatus.IS)
        self.assertEqual(st2.cached_regular, CacheStatus.IS)

        clan = Set(Set([Couplet(s, c) for s, c in zip('abcd', [1, 2, 3, 4])]),
                   Set([Couplet(s, c) for s, c in zip('abc', [5, 6, 7])]),
                   Set([Couplet(s, c) for s, c in zip('bd', [8, 9])]))
        types = {'a': int, 'b': int, 'c': int, 'd': int}
        st3 = import_csv(IoCsvTests.path('set3.csv'), types)
        # print("expected", clan)
        # print("actual", st3)
        self.assertEqual(clan, st3)
        self.assertEqual(st3.cached_clan, CacheStatus.IS)
        self.assertEqual(st3.cached_functional, CacheStatus.IS)
        self.assertEqual(st3.cached_regular, CacheStatus.IS_NOT)

        expected = Multiset({
            Set(Couplet('a', '1'), Couplet('b', '2'), Couplet('c', '3'),
                Couplet('d', '4')):
            1,
            Set(Couplet('a', '5'), Couplet('b', '6'), Couplet('c', '7')):
            1,
            Set(Couplet('b', '8'), Couplet('d', '9')):
            1
        })
        actual = import_csv(IoCsvTests.path('set3.csv'), has_dup_rows=True)
        self.assertEqual(actual, expected)

        self.assertEqual(actual.cached_multiclan, CacheStatus.IS)
        self.assertEqual(actual.cached_functional, CacheStatus.IS)
        self.assertEqual(actual.cached_regular, CacheStatus.IS_NOT)

        expected = Multiset({
            Set(Couplet('a', '5'), Couplet('b', '6'), Couplet('c', '7')):
            1,
            Set(Couplet('b', '8'), Couplet('d', '9')):
            1,
            Set(Couplet('a', '1'), Couplet('b', '2'), Couplet('c', '3'),
                Couplet('d', '4')):
            1
        })
        export_path = IoCsvTests.path('export.csv')
        self.assertFalse(export_csv(expected, export_path))
        self.assertTrue(export_csv(expected, export_path,
                                   ordered_lefts='abcd'))
        actual = import_csv(export_path, has_dup_rows=True)
        self.assertEqual(actual, expected)
示例#5
0
def query5():
    # select
    #     nationname,
    #     sum(lineitem.extendedprice * (1 - lineitem.discount)) as revenue
    # from
    #     customer, orders, lineitem,   -- Loaded from CSV
    #     nation, region                -- Loaded from XML
    # where
    #     customer.custkey = orders.custkey
    #     and lineitem.orderkey = orders.orderkey
    #     and customer.nationkey = nation.nationkey
    #     and supplier.nationkey = nation.nationkey
    #     and nation.regionkey = region.regionkey
    #     and region.name = 'AMERICA'
    #     and orders.orderdate >= date '1996-01-01'
    #     and orders.orderdate < date '1996-01-01' + interval '1' year
    # group by
    #     n_name
    timer = FunctionTimer()
    short_prints = True

    # Join supplier_solutions and customers_nations_projected on 'nationkey'.
    result1 = clans.cross_functional_union(
        get_supplier_solutions(),
        get_customers_nations_projected(get_nations(region_name)))
    timer.lap('result1', short=short_prints)

    # Join result1 with orders_restricted_projected on 'custkey'.
    result2 = clans.cross_functional_union(
        result1, get_orders_restricted_projected(start_date, end_date))
    timer.lap('result2', short=short_prints)

    # Join result with lineitem on 'orderkey' and 'suppkey'.
    lineitem_types = {
        'orderkey': int,
        'suppkey': int,
        'extendedprice': float,
        'discount': float,
        'partkey': int,
        'linenumber': int,
        'quantity': int,
        'tax': float,
    }
    result3 = clans.cross_functional_union(
        result2, csv.import_csv('lineitem.csv', lineitem_types))
    timer.lap('result3', short=short_prints)

    # Add the 'revenue' column.
    def calc_revenue(rel):
        return Couplet(
            'revenue',
            rel('extendedprice').value * (1 - rel('discount').value))

    result4 = Set(
        relations.functional_add(rel, calc_revenue(rel)) for rel in result3)
    timer.lap('result4', short=short_prints)
    # Remove unnecessary columns.
    revenue_by_nations = clans.project(result4, 'revenue', 'nationname')

    # Partition the result on 'nationname'.
    revenue_grouped_by_nations = partition.partition(
        revenue_by_nations, lambda rel: rel('nationname'))
    timer.lap('revenue_grouped_by_nations', short=short_prints)

    # Almost generic aggregation function. (Handles 'normal' cases, but not all edge cases.)
    def aggregate(horde, group_left, aggregation_left, aggregate_func):
        aggregation = {}
        for clan in horde:
            aggregation_value = aggregate_func.identity
            for relation in clan:
                aggregation_value = aggregate_func(
                    aggregation_value,
                    relation(aggregation_left).value)
            first_relation = next(iter(clan))
            aggregation[first_relation(group_left)] = aggregation_value
        return Set([
            Set(Couplet(group_left, key),
                Couplet(aggregation_left, aggregation[key]))
            for key in aggregation
        ])

    # Our aggregation function (adding two numbers, identity is 0).
    def aggregate_sum(arg1, arg2):
        return arg1 + arg2

    aggregate_sum.identity = 0

    # Calculate the aggregation result.
    # noinspection PyTypeChecker
    query5_result = aggregate(revenue_grouped_by_nations, 'nationname',
                              'revenue', aggregate_sum)
    timer.end('query5_result')

    return query5_result
示例#6
0
peach,doug
rice,frank
rice,frank
banana,doug
apple,doug
rice,jane
apple,jane
"""

# Tables can be modeled as multisets of binary relations, which we call clans.
from io import StringIO
from algebraixlib.import_export.csv import import_csv
from algebraixlib.import_export.csv import export_csv

file = StringIO(sales_csv)
sales_multiclan = import_csv(
    file, has_dup_rows=True)  # note the use of flag to return a multiclan
print(sales_multiclan)

# lets see how our cashiers are doing
cashier_diagonal = _mo.Multiset(
    {_mo.Set(_mo.Couplet('cashier', 'cashier')): 1})
cashier_sales = _multiclans.compose(sales_multiclan, cashier_diagonal)
print(
    "To find out how many sales each cashier has had, we can do the following:"
)
print(str("sales_multiclan") + ' COMPOSE ' + str(cashier_diagonal))
print('=> EVALUATES TO ' + str(cashier_sales) + "\n")

# lets see what products we sold for the day
product_diagonal = _mo.Multiset(
    {_mo.Set(_mo.Couplet('product', 'product')): 1})
示例#7
0
gallo,Spanish,rooster
Duniy?,Hindi,earth
Kon'nichiwa,Japanese,salutation
hallo,German,salutation
nuqneH,Klingon,salutation
sekai,Japanese,earth
schmetterling,German,butterfly
mariposa,Spanish,butterfly
"""

# Tables can be modeled as sets of binary relations, which we call clans.
from io import StringIO
from algebraixlib.import_export.csv import import_csv

file = StringIO(vocab_csv)
vocab_clan = import_csv(file)
print("vocab_clan:", vocab_clan)

# Superstriction(A, B) is a partial binary operation on sets. It is defined as A if A is a superset
# of B, otherwise it is undefined.
hello_relation = Set(Couplet('word', 'hello'), Couplet('language', 'English'),
                     Couplet('meaning', 'salutation'))
super_pos = sets.superstrict(hello_relation,
                             Set(Couplet('language', 'English')))
super_neg = sets.superstrict(hello_relation,
                             Set(Couplet('language', 'Mandarin')))
print(super_pos)
print(super_neg)

# By extending superstriction to clans, which are sets of sets (of Couplets), we can define a
# helpful mechanism to restrict vocab_clan to only those relations that contain particular values.