def test_lhs_cross_functional_union(self): """Test for functional_cross_union.""" table_a = import_csv(self._get_table_a()) table_b = import_csv(self._get_table_b()) self.assertTrue(is_functional(table_a)) self.assertTrue(is_functional(table_b)) # Calculate left join. result = lhs_cross_functional_union(table_a, table_b) # Test result set properties self.assertEqual(result.cached_functional, CacheStatus.IS) self.assertFalse(result.is_empty) self.assertEqual(result.cardinality, 8) expected = import_csv(self._get_result_cross_functional_union()) self.assertEqual(result, expected) import algebraixlib.algebras.sets as sets table_aa = sets.union(table_a, Set(Set(Couplet('PK', '-1'), Couplet('PK', '-2')))) self.assertFalse(is_functional(table_aa)) result = lhs_cross_functional_union(table_aa, table_b) self.assertNotEqual(result.cached_functional, CacheStatus.IS) table_bb = sets.union(table_b, Set(Set(Couplet('PK', '-1'), Couplet('PK', '-2')))) self.assertFalse(is_functional(table_bb)) result = lhs_cross_functional_union(table_a, table_bb) self.assertEqual(result.cached_functional, CacheStatus.IS)
def get_customers_nations_projected(nations): """Execute the equivalent of the following SQL query, querying the CSV file customer.csv: SELECT custkey, nationkey, nationname FROM customer JOIN nations ON customer.nationkey = nations.nationkey """ timer = FunctionTimer() short_prints = True customer_types = {'custkey': int, 'nationkey': int, 'acctbal': float} customers = csv.import_csv('customer.csv', customer_types) timer.lap('customers', short=short_prints) customers_nations = clans.cross_functional_union(customers, nations) timer.lap('customers_nations', short=short_prints) customers_nations_projected = clans.project(customers_nations, 'custkey', 'nationkey', 'nationname') timer.end('customers_nations_projected', short=short_prints) return customers_nations_projected
def get_orders_restricted_projected(startdate, enddate): """Execute the equivalent of the following SQL query, querying the CSV file orders.csv: SELECT orderkey, custkey FROM orders WHERE startdate <= orders.orderdate and orders.orderdate < enddate :param startdate: The lower boundary (inclusive) of the date range for the column 'orderdate'. :param enddate: The upper boundary (exclusive) of the date range for the column 'orderdate'. """ timer = FunctionTimer() short_prints = True def read_date(date_str: str) -> datetime: return datetime.strptime(date_str, '%Y-%m-%d').date() orders_types = { 'orderkey': int, 'custkey': int, 'orderdate': read_date, 'totalprice': float, 'shippriority': int } orders = csv.import_csv('orders.csv', orders_types) timer.lap('orders', short=short_prints) def select_dates(rel) -> bool: orderdate = rel('orderdate').value return (startdate <= orderdate) and (orderdate < enddate) orders_restricted = sets.restrict(orders, select_dates) timer.lap('orders_restricted', short=short_prints) orders_restricted_projected = clans.project(orders_restricted, 'orderkey', 'custkey') timer.end('orders_restricted_projected', short=short_prints) return orders_restricted_projected
def test_csv(self): """Test loading clan from csv.""" clan = Set({Set({Couplet('a', '1'), Couplet('b', '2')})}) st1 = import_csv(IoCsvTests.path('set1.csv')) self.assertEqual(clan, st1) self.assertEqual(st1.cached_clan, CacheStatus.IS) self.assertEqual(st1.cached_functional, CacheStatus.IS) self.assertEqual(st1.cached_regular, CacheStatus.IS) clan = Set({Set({Couplet('a', '1'), Couplet('b', '2')})}) st1a = import_csv(IoCsvTests.path('set1a.csv')) # NOTE: duplicate row is removed self.assertEqual(clan, st1a) self.assertEqual(st1a.cached_clan, CacheStatus.IS) self.assertEqual(st1a.cached_functional, CacheStatus.IS) self.assertEqual(st1a.cached_regular, CacheStatus.IS) clan = Set( Set({Couplet('a', '1'), Couplet('b', '2'), Couplet('row', 0)}), Set({Couplet('a', '1'), Couplet('b', '2'), Couplet('row', 1)})) st1a = import_csv(IoCsvTests.path('set1a.csv'), index_column='row') # NOTE: duplicate row is NOT removed self.assertEqual(clan, st1a) self.assertEqual(st1a.cached_clan, CacheStatus.IS) self.assertEqual(st1a.cached_functional, CacheStatus.IS) self.assertEqual(st1a.cached_regular, CacheStatus.IS) expected = Multiset({Set(Couplet('a', '1'), Couplet('b', '2')): 2}) actual = import_csv(IoCsvTests.path('set1a.csv'), has_dup_rows=True) # NOTE: duplicate row is not removed self.assertEqual(actual, expected) self.assertEqual(actual.cached_multiclan, CacheStatus.IS) self.assertEqual(actual.cached_functional, CacheStatus.IS) self.assertEqual(actual.cached_regular, CacheStatus.IS) clan = Set({ Set({Couplet('a', '1'), Couplet('b', '2')}), Set({Couplet('a', '3'), Couplet('b', '4')}) }) st2 = import_csv(IoCsvTests.path('set2.csv')) self.assertEqual(clan, st2) self.assertEqual(st2.cached_clan, CacheStatus.IS) self.assertEqual(st2.cached_functional, CacheStatus.IS) self.assertEqual(st2.cached_regular, CacheStatus.IS) clan = Set(Set([Couplet(s, c) for s, c in zip('abcd', [1, 2, 3, 4])]), Set([Couplet(s, c) for s, c in zip('abc', [5, 6, 7])]), Set([Couplet(s, c) for s, c in zip('bd', [8, 9])])) types = {'a': int, 'b': int, 'c': int, 'd': int} st3 = import_csv(IoCsvTests.path('set3.csv'), types) # print("expected", clan) # print("actual", st3) self.assertEqual(clan, st3) self.assertEqual(st3.cached_clan, CacheStatus.IS) self.assertEqual(st3.cached_functional, CacheStatus.IS) self.assertEqual(st3.cached_regular, CacheStatus.IS_NOT) expected = Multiset({ Set(Couplet('a', '1'), Couplet('b', '2'), Couplet('c', '3'), Couplet('d', '4')): 1, Set(Couplet('a', '5'), Couplet('b', '6'), Couplet('c', '7')): 1, Set(Couplet('b', '8'), Couplet('d', '9')): 1 }) actual = import_csv(IoCsvTests.path('set3.csv'), has_dup_rows=True) self.assertEqual(actual, expected) self.assertEqual(actual.cached_multiclan, CacheStatus.IS) self.assertEqual(actual.cached_functional, CacheStatus.IS) self.assertEqual(actual.cached_regular, CacheStatus.IS_NOT) expected = Multiset({ Set(Couplet('a', '5'), Couplet('b', '6'), Couplet('c', '7')): 1, Set(Couplet('b', '8'), Couplet('d', '9')): 1, Set(Couplet('a', '1'), Couplet('b', '2'), Couplet('c', '3'), Couplet('d', '4')): 1 }) export_path = IoCsvTests.path('export.csv') self.assertFalse(export_csv(expected, export_path)) self.assertTrue(export_csv(expected, export_path, ordered_lefts='abcd')) actual = import_csv(export_path, has_dup_rows=True) self.assertEqual(actual, expected)
def query5(): # select # nationname, # sum(lineitem.extendedprice * (1 - lineitem.discount)) as revenue # from # customer, orders, lineitem, -- Loaded from CSV # nation, region -- Loaded from XML # where # customer.custkey = orders.custkey # and lineitem.orderkey = orders.orderkey # and customer.nationkey = nation.nationkey # and supplier.nationkey = nation.nationkey # and nation.regionkey = region.regionkey # and region.name = 'AMERICA' # and orders.orderdate >= date '1996-01-01' # and orders.orderdate < date '1996-01-01' + interval '1' year # group by # n_name timer = FunctionTimer() short_prints = True # Join supplier_solutions and customers_nations_projected on 'nationkey'. result1 = clans.cross_functional_union( get_supplier_solutions(), get_customers_nations_projected(get_nations(region_name))) timer.lap('result1', short=short_prints) # Join result1 with orders_restricted_projected on 'custkey'. result2 = clans.cross_functional_union( result1, get_orders_restricted_projected(start_date, end_date)) timer.lap('result2', short=short_prints) # Join result with lineitem on 'orderkey' and 'suppkey'. lineitem_types = { 'orderkey': int, 'suppkey': int, 'extendedprice': float, 'discount': float, 'partkey': int, 'linenumber': int, 'quantity': int, 'tax': float, } result3 = clans.cross_functional_union( result2, csv.import_csv('lineitem.csv', lineitem_types)) timer.lap('result3', short=short_prints) # Add the 'revenue' column. def calc_revenue(rel): return Couplet( 'revenue', rel('extendedprice').value * (1 - rel('discount').value)) result4 = Set( relations.functional_add(rel, calc_revenue(rel)) for rel in result3) timer.lap('result4', short=short_prints) # Remove unnecessary columns. revenue_by_nations = clans.project(result4, 'revenue', 'nationname') # Partition the result on 'nationname'. revenue_grouped_by_nations = partition.partition( revenue_by_nations, lambda rel: rel('nationname')) timer.lap('revenue_grouped_by_nations', short=short_prints) # Almost generic aggregation function. (Handles 'normal' cases, but not all edge cases.) def aggregate(horde, group_left, aggregation_left, aggregate_func): aggregation = {} for clan in horde: aggregation_value = aggregate_func.identity for relation in clan: aggregation_value = aggregate_func( aggregation_value, relation(aggregation_left).value) first_relation = next(iter(clan)) aggregation[first_relation(group_left)] = aggregation_value return Set([ Set(Couplet(group_left, key), Couplet(aggregation_left, aggregation[key])) for key in aggregation ]) # Our aggregation function (adding two numbers, identity is 0). def aggregate_sum(arg1, arg2): return arg1 + arg2 aggregate_sum.identity = 0 # Calculate the aggregation result. # noinspection PyTypeChecker query5_result = aggregate(revenue_grouped_by_nations, 'nationname', 'revenue', aggregate_sum) timer.end('query5_result') return query5_result
peach,doug rice,frank rice,frank banana,doug apple,doug rice,jane apple,jane """ # Tables can be modeled as multisets of binary relations, which we call clans. from io import StringIO from algebraixlib.import_export.csv import import_csv from algebraixlib.import_export.csv import export_csv file = StringIO(sales_csv) sales_multiclan = import_csv( file, has_dup_rows=True) # note the use of flag to return a multiclan print(sales_multiclan) # lets see how our cashiers are doing cashier_diagonal = _mo.Multiset( {_mo.Set(_mo.Couplet('cashier', 'cashier')): 1}) cashier_sales = _multiclans.compose(sales_multiclan, cashier_diagonal) print( "To find out how many sales each cashier has had, we can do the following:" ) print(str("sales_multiclan") + ' COMPOSE ' + str(cashier_diagonal)) print('=> EVALUATES TO ' + str(cashier_sales) + "\n") # lets see what products we sold for the day product_diagonal = _mo.Multiset( {_mo.Set(_mo.Couplet('product', 'product')): 1})
gallo,Spanish,rooster Duniy?,Hindi,earth Kon'nichiwa,Japanese,salutation hallo,German,salutation nuqneH,Klingon,salutation sekai,Japanese,earth schmetterling,German,butterfly mariposa,Spanish,butterfly """ # Tables can be modeled as sets of binary relations, which we call clans. from io import StringIO from algebraixlib.import_export.csv import import_csv file = StringIO(vocab_csv) vocab_clan = import_csv(file) print("vocab_clan:", vocab_clan) # Superstriction(A, B) is a partial binary operation on sets. It is defined as A if A is a superset # of B, otherwise it is undefined. hello_relation = Set(Couplet('word', 'hello'), Couplet('language', 'English'), Couplet('meaning', 'salutation')) super_pos = sets.superstrict(hello_relation, Set(Couplet('language', 'English'))) super_neg = sets.superstrict(hello_relation, Set(Couplet('language', 'Mandarin'))) print(super_pos) print(super_neg) # By extending superstriction to clans, which are sets of sets (of Couplets), we can define a # helpful mechanism to restrict vocab_clan to only those relations that contain particular values.