def protocol():
    """Assemble a three-party workflow and return its root relations.

    Parties 1, 2 and 3 each contribute a relation with INTEGER columns "a"
    and "b". The inputs are concatenated, projected onto both columns,
    aggregated ("sum" over "b", grouped by "a", output "total_b"), passed
    through a divide and a multiply that target column "a" with operands
    ["a", 1], and finally handed to ``cc.collect`` with argument 1.

    :return: set containing the three input relations
    """
    # define inputs
    # NOTE(review): the second relation is registered as "in2" while its
    # siblings are "in_1" / "in_3" -- confirm the missing underscore is wanted.
    roots = []
    for party, rel_name in ((1, "in_1"), (2, "in2"), (3, "in_3")):
        schema = [defCol(col_name, "INTEGER", [party]) for col_name in ("a", "b")]
        roots.append(cc.create(rel_name, schema, {party}))

    # combine parties' inputs into one relation
    combined = cc.concat(list(roots), "rel")

    projected = cc.project(combined, "proj", ["a", "b"])
    summed = cc.aggregate(projected, "agg", ["a"], "b", "sum", "total_b")
    divided = cc.divide(summed, "div", "a", ["a", 1])
    multiplied = cc.multiply(divided, "mult", "a", ["a", 1])
    cc.collect(multiplied, 1)

    # return root nodes
    return set(roots)
def protocol():
    """Join one relation against two concatenated company relations.

    Relation "a_govreg" (party 1, columns "a"/"b") and relations "company0"
    (party 2) and "company1" (party 3, columns "c"/"d") are each passed
    through a projection onto all of their own columns. The two company
    projections are concatenated, joined to the first projection on
    "a" == "c", aggregated ("sum" over "d", grouped by "b", output "total"),
    and handed to ``cc.collect`` with argument 1.

    :return: set containing the three input relations
    """
    regulator_schema = [defCol(col_name, "INTEGER", [1]) for col_name in ("a", "b")]
    regulator = cc.create("a_govreg", regulator_schema, {1})
    regulator_proj = cc.project(regulator, "govreg_dummy", ["a", "b"])

    company_roots = []
    company_projs = []
    for rel_name, party in (("company0", 2), ("company1", 3)):
        # column "c" receives two list arguments ([1] and the owner's list),
        # column "d" only the owner's list
        schema = [
            defCol("c", "INTEGER", [1], [party]),
            defCol("d", "INTEGER", [party]),
        ]
        root = cc.create(rel_name, schema, {party})
        company_roots.append(root)
        company_projs.append(cc.project(root, rel_name + "_dummy", ["c", "d"]))

    all_companies = cc.concat(company_projs, "companies")
    matched = cc.join(regulator_proj, all_companies, "joined", ["a"], ["c"])
    totals = cc.aggregate(matched, "actual", ["b"], "d", "sum", "total")
    cc.collect(totals, 1)

    return set([regulator] + company_roots)
def protocol():
    """Two-party workflow: concat, two projections, aggregate, projection.

    Parties 1 and 2 each supply a relation with INTEGER columns "a" and "b".
    After concatenation the data is projected twice onto ["a", "b"],
    aggregated ("sum" over "b", grouped by "a", output "total_b"), projected
    onto ["a", "total_b"], and handed to ``cc.collect`` with argument 1.

    :return: set containing the two input relations
    """
    # define inputs
    first = cc.create(
        "in_1",
        [defCol("a", "INTEGER", [1]), defCol("b", "INTEGER", [1])],
        {1},
    )
    second = cc.create(
        "in_2",
        [defCol("a", "INTEGER", [2]), defCol("b", "INTEGER", [2])],
        {2},
    )

    # combine parties' inputs into one relation
    merged = cc.concat([first, second], "rel")

    # specify the workflow; each step consumes the previous step's node
    stage = cc.project(merged, "proj_a", ["a", "b"])
    stage = cc.project(stage, "proj_b", ["a", "b"])
    stage = cc.aggregate(stage, "agg", ["a"], "b", "sum", "total_b")
    stage = cc.project(stage, "proj_c", ["a", "total_b"])
    cc.collect(stage, 1)

    # return root nodes
    return {first, second}
def protocol():
    """Join relation "govreg" with the concatenation of two company relations.

    "govreg" (party 1, columns "a"/"b") is joined on "a" == "c" with the
    concatenation of "company0" (party 2) and "company1" (party 3), both
    with columns "c"/"d". The join result is aggregated ("sum" over "d",
    grouped by "b", output "total") and handed to ``cc.collect`` with
    argument 1.

    :return: set containing the three input relations
    """
    # NOTE(review): the third defCol argument is a bare int here, whereas
    # other protocols in this file pass a list such as [1] -- confirm that
    # defCol accepts both forms.
    def int_columns(names, party):
        return [defCol(col_name, "INTEGER", party) for col_name in names]

    govreg = cc.create("govreg", int_columns(("a", "b"), 1), {1})
    company0 = cc.create("company0", int_columns(("c", "d"), 2), {2})
    company1 = cc.create("company1", int_columns(("c", "d"), 3), {3})

    all_companies = cc.concat([company0, company1], "companies")
    matched = cc.join(govreg, all_companies, "joined", ["a"], ["c"])
    totals = cc.aggregate(matched, "actual", ["b"], "d", "sum", "total")
    cc.collect(totals, 1)

    return {govreg, company0, company1}
def protocol():
    """Three-party revenue / market-share workflow ending in relation "hhi_only".

    Parties 1-3 each supply a relation ("yellow1".."yellow3") with INTEGER
    columns "companyID" and "price". The relation and column names suggest a
    Herfindahl-Hirschman-style index is being computed; that reading is an
    assumption and is not verified here.

    :return: set containing the three input relations
    """
    def cab_input(rel_name, party):
        schema = [
            defCol("companyID", "INTEGER", [party]),
            defCol("price", "INTEGER", [party]),
        ]
        return sal.create(rel_name, schema, {party})

    yellow1 = cab_input("yellow1", 1)
    yellow2 = cab_input("yellow2", 2)
    yellow3 = cab_input("yellow3", 3)

    trips = sal.concat([yellow1, yellow2, yellow3], "cab_data")
    chosen = sal.project(trips, "selected_input", ["companyID", "price"])

    # per-company sum of "price", then divided by 1000
    revenue = sal.aggregate(
        chosen, "local_rev", ["companyID"], "price", "+", "local_rev")
    revenue_small = sal.divide(
        revenue, "scaled_down", "local_rev", ["local_rev", 1000])

    # presumably zeroes "companyID" so that the later aggregation and join
    # treat all rows as one group -- TODO confirm
    zero_keyed = sal.multiply(
        revenue_small, "first_val_blank", "companyID", ["companyID", 0])

    # both of the next two nodes consume zero_keyed
    revenue_scaled = sal.multiply(
        zero_keyed, "local_rev_scaled", "local_rev", ["local_rev", 100])
    grand_total = sal.aggregate(
        zero_keyed, "total_rev", ["companyID"], "local_rev", "+", "global_rev")

    with_total = sal.join(
        revenue_scaled, grand_total, "local_total_rev",
        ["companyID"], ["companyID"])
    share = sal.divide(
        with_total, "market_share", "local_rev", ["local_rev", "global_rev"])
    share_squared = sal.multiply(
        share, "market_share_squared", "local_rev",
        ["local_rev", "local_rev", 1])
    index = sal.aggregate(
        share_squared, "hhi", ["companyID"], "local_rev", "+", "hhi")

    # dummy projection to force non-mpc subdag
    index_only = sal.project(index, "hhi_only", ["companyID", "hhi"])
    sal.collect(index_only, 1)

    # return root nodes
    return {yellow1, yellow2, yellow3}
def protocol():
    """Aggregate the first relation returned by ``setup()`` and collect it.

    The aggregation groups by ["a", "b"], applies "sum" over "c" and names
    the output column "agg_1"; the result goes to ``cc.collect`` with
    argument 1.

    :return: set containing the single input relation used
    """
    root = setup()[0]
    grouped = cc.aggregate(root, "agg", ["a", "b"], "c", "sum", "agg_1")
    cc.collect(grouped, 1)
    return {root}
def protocol():
    """Single-party variant of the revenue / market-share workflow.

    All three inputs ("yellow1".."yellow3") are created with party 1 and
    INTEGER columns "companyID" and "price". The final "hhi" aggregation is
    handed directly to ``sal.collect`` with argument 1. The names suggest a
    Herfindahl-Hirschman-style index; that reading is an assumption.

    :return: set containing the three input relations
    """
    # all column definitions are built before any relation is created
    schemas = [
        [defCol("companyID", "INTEGER", [1]), defCol("price", "INTEGER", [1])]
        for _ in range(3)
    ]
    roots = [
        sal.create(rel_name, schema, {1})
        for rel_name, schema in zip(("yellow1", "yellow2", "yellow3"), schemas)
    ]

    trips = sal.concat(list(roots), "cab_data")
    chosen = sal.project(trips, "selected_input", ["companyID", "price"])
    revenue = sal.aggregate(chosen, "local_rev", ["companyID"], "price", "+", "local_rev")
    revenue_small = sal.divide(revenue, "scaled_down", "local_rev", ["local_rev", 1000])

    # presumably zeroes "companyID" so all rows share one key -- TODO confirm
    zero_keyed = sal.multiply(revenue_small, "first_val_blank", "companyID", ["companyID", 0])

    # both of the next two nodes consume zero_keyed
    revenue_scaled = sal.multiply(zero_keyed, "local_rev_scaled", "local_rev", ["local_rev", 100])
    grand_total = sal.aggregate(zero_keyed, "total_rev", ["companyID"], "local_rev", "+", "global_rev")

    with_total = sal.join(revenue_scaled, grand_total, "local_total_rev", ["companyID"], ["companyID"])
    share = sal.divide(with_total, "market_share", "local_rev", ["local_rev", "global_rev"])
    share_squared = sal.multiply(share, "market_share_squared", "local_rev", ["local_rev", "local_rev", 1])
    index = sal.aggregate(share_squared, "hhi", ["companyID"], "local_rev", "+", "hhi")
    sal.collect(index, 1)

    return set(roots)
def protocol():
    """Aggregate the first relation from ``setup()`` via ``sal`` and collect it.

    Groups by ["a", "b"], applies "sum" over "c" with output column "agg_1",
    then passes the node to ``sal.collect`` with argument 1.

    :return: set containing the single input relation used
    """
    first_input = setup()[0]
    aggregated = sal.aggregate(first_input, "agg", ["a", "b"], "c", "sum", "agg_1")
    sal.collect(aggregated, 1)
    return {first_input}
def protocol():
    """Three-party revenue / market-share workflow built with ``cc``.

    Parties 1-3 supply relations "in1".."in3" with INTEGER columns
    "companyID" and "price". Aggregations use the "sum" operator and the
    final "hhi" node is handed to ``cc.collect`` with argument 1. The names
    suggest a Herfindahl-Hirschman-style index; that reading is an
    assumption.

    :return: set containing the three input relations
    """
    roots = []
    for party in (1, 2, 3):
        schema = [
            defCol(col_name, "INTEGER", [party])
            for col_name in ("companyID", "price")
        ]
        roots.append(cc.create("in{}".format(party), schema, {party}))

    trips = cc.concat(list(roots), "cab_data")
    chosen = cc.project(trips, "selected_input", ["companyID", "price"])
    revenue = cc.aggregate(chosen, "local_rev", ["companyID"], "price", "sum", "local_rev")
    revenue_small = cc.divide(revenue, "scaled_down", "local_rev", ["local_rev", 1000])

    # presumably zeroes "companyID" so all rows share one key -- TODO confirm
    zero_keyed = cc.multiply(revenue_small, "first_val_blank", "companyID", ["companyID", 0])

    # both of the next two nodes consume zero_keyed
    revenue_scaled = cc.multiply(zero_keyed, "local_rev_scaled", "local_rev", ["local_rev", 100])
    grand_total = cc.aggregate(zero_keyed, "total_rev", ["companyID"], "local_rev", "sum", "global_rev")

    with_total = cc.join(revenue_scaled, grand_total, "local_total_rev", ["companyID"], ["companyID"])
    share = cc.divide(with_total, "market_share", "local_rev", ["local_rev", "global_rev"])
    share_squared = cc.multiply(share, "market_share_squared", "local_rev", ["local_rev", "local_rev", 1])
    index = cc.aggregate(share_squared, "hhi", ["companyID"], "local_rev", "sum", "hhi")
    cc.collect(index, 1)

    # return root nodes
    return set(roots)
def agg():
    """Aggregate the first relation from ``setup()`` and collect the result.

    Groups by ["a", "b"], applies "sum" over "c" with output column "agg1",
    then passes the node to ``sal.collect`` with argument 1.

    :return: set containing the single input relation used
    """
    root = setup()[0]
    # local deliberately not called "agg" so it does not shadow this function
    summed = sal.aggregate(root, "agg", ["a", "b"], "c", "sum", "agg1")
    sal.collect(summed, 1)
    return {root}
def protocol():
    """Multiply one input, project the other, then join, aggregate, collect.

    The first relation from ``setup()`` goes through a multiply targeting
    "a" with operands ["b", "c"]; the second is projected onto ["a", "b"].
    Both are joined on ["a", "b"], aggregated ("sum" over "c", grouped by
    ["a", "b"], output "agg_1") and passed to ``sal.collect`` with
    argument 1.

    :return: set containing the two input relations
    """
    inputs = setup()
    left_root = inputs[0]
    right_root = inputs[1]

    left_mult = sal.multiply(left_root, "mult", "a", ["b", "c"])
    right_proj = sal.project(right_root, "proj_2", ["a", "b"])

    key_cols = ("a", "b")
    joined = sal.join(left_mult, right_proj, "join", list(key_cols), list(key_cols))
    summed = sal.aggregate(joined, "agg", list(key_cols), "c", "sum", "agg_1")
    sal.collect(summed, 1)

    return {left_root, right_root}
def protocol():
    """Create one party-1 relation and attach a single aggregation to it.

    Relation "in1" has INTEGER columns 'a' and 'b'. The aggregation groups
    by ['a'], applies '+' over 'b' and keeps the output name 'b'. No collect
    step is issued in this function.

    :return: set containing the input relation
    """
    schema = [defCol(col_name, 'INTEGER', [1]) for col_name in ('a', 'b')]
    root = sal.create("in1", schema, {1})

    # the returned node is not used further in this function
    sal.aggregate(root, 'agg1', ['a'], 'b', '+', 'b')

    return {root}
def protocol():
    """Divide/project one input, multiply the other, join, aggregate, collect.

    The first relation from ``setup()`` is divided (target "a", operands
    ["a", "b"]) and later projected onto ["a", "b"]; the second is
    multiplied (target "a", operands ["a", "b"]). The two branches are
    joined on ["a", "b"], aggregated ("sum" over "c", output "agg_1") and
    passed to ``cc.collect`` with argument 1.

    :return: set containing the two input relations
    """
    inputs = setup()
    left_root = inputs[0]
    right_root = inputs[1]

    # node creation order matches the original: divide, multiply, project
    left_div = cc.divide(left_root, "div", "a", ["a", "b"])
    right_mult = cc.multiply(right_root, "mult", "a", ["a", "b"])
    left_proj = cc.project(left_div, "proj", ["a", "b"])

    joined = cc.join(left_proj, right_mult, "join", ["a", "b"], ["a", "b"])
    summed = cc.aggregate(joined, "agg", ["a", "b"], "c", "sum", "agg_1")
    cc.collect(summed, 1)

    return {left_root, right_root}
def protocol():
    """``sal`` workflow: divide + project on one side, multiply on the other.

    Both branches (built from the first two relations returned by
    ``setup()``) are joined on ["a", "b"], aggregated ("sum" over "c",
    grouped by ["a", "b"], output "agg_1") and passed to ``sal.collect``
    with argument 1.

    :return: set containing the two input relations
    """
    sources = setup()
    first, second = sources[0], sources[1]

    # node creation order matches the original: divide, multiply, project
    quotient = sal.divide(first, "div", "a", ["a", "b"])
    product = sal.multiply(second, "mult", "a", ["a", "b"])
    quotient_proj = sal.project(quotient, "proj", ["a", "b"])

    joined = sal.join(quotient_proj, product, "join", ["a", "b"], ["a", "b"])
    summed = sal.aggregate(joined, "agg", ["a", "b"], "c", "sum", "agg_1")
    sal.collect(summed, 1)

    return {first, second}
def protocol():
    """Two-party join of projected inputs followed by a sum aggregation.

    "left" (party 1, columns "a"/"b") and "right" (party 2, columns
    "c"/"d") are each projected onto all of their columns, joined on
    "a" == "c", aggregated ("sum" over "d", grouped by "b", output "total")
    and passed to ``cc.collect`` with argument 1.

    :return: set containing the two input relations
    """
    left_root = cc.create(
        "left",
        [defCol(col_name, "INTEGER", [1]) for col_name in ("a", "b")],
        {1},
    )
    left_proj = cc.project(left_root, "left_dummy", ["a", "b"])

    right_root = cc.create(
        "right",
        [defCol(col_name, "INTEGER", [2]) for col_name in ("c", "d")],
        {2},
    )
    right_proj = cc.project(right_root, "right_dummy", ["c", "d"])

    matched = cc.join(left_proj, right_proj, "joined", ["a"], ["c"])
    totals = cc.aggregate(matched, "actual", ["b"], "d", "sum", "total")
    cc.collect(totals, 1)

    return {left_root, right_root}
def protocol():
    """
    Demo protocol: reads data from data/input_relation.csv, performs a
    multiplication followed by an aggregation, and stores the result under
    data/aggregated.csv.

    :return set of input relations
    """
    # input schema: (column name, type, trust set) for each column
    schema = [
        defCol(col_name, "INTEGER", [1])
        for col_name in ("column_a", "column_b")
    ]

    # input relation: relation name, columns, and owner set
    source = lang.create("input_relation", schema, {1})

    # square column_b: (column_a, column_b) -> (column_a, column_b * column_b)
    squared = lang.multiply(source, "squared", "column_b", ["column_b", "column_b"])

    # sum column_b grouped by column_a; the output column is named "summed".
    # The node is a leaf, and leaf nodes are automatically written to file,
    # so the result ends up in ./data/aggregated.csv
    lang.aggregate(squared, "aggregated", ["column_a"], "column_b", "+", "summed")

    # return all input relations
    return {source}
def protocol():
    """Concatenate two party-1 relations, aggregate, and collect.

    "left" has columns "column_a"/"column_b" and "right" has
    "column_a"/"column_c"; both are created with party 1. Their
    concatenation "rel" is aggregated ("sum" over "column_b", grouped by
    "column_a", output "total_b") into "expected" and passed to
    ``cc.collect`` with argument 1.

    :return: set containing the two input relations
    """
    left_schema = [
        defCol("column_a", "INTEGER", [1]),
        defCol("column_b", "INTEGER", [1]),
    ]
    left_root = cc.create("left", left_schema, {1})

    right_schema = [
        defCol("column_a", "INTEGER", [1]),
        defCol("column_c", "INTEGER", [1]),
    ]
    right_root = cc.create("right", right_schema, {1})

    # the original nested these three calls; the evaluation order is the same
    combined = cc.concat([left_root, right_root], "rel")
    totals = cc.aggregate(combined, "expected", ["column_a"], "column_b", "sum", "total_b")
    cc.collect(totals, 1)

    return {left_root, right_root}
def protocol():
    """Two-party concat + aggregate where party 2's "a" column gets an extra list.

    "in_1" (party 1) declares both columns with [1]. "in_2" (party 2)
    declares "a" with the two list arguments [1] and [2], and "b" with [2].
    The concatenation "rel" is aggregated ("sum" over "b", grouped by "a",
    output "total_b") and passed to ``cc.collect`` with argument 1.

    :return: set containing the two input relations
    """
    first = cc.create(
        "in_1",
        [defCol("a", "INTEGER", [1]), defCol("b", "INTEGER", [1])],
        {1},
    )
    second = cc.create(
        "in_2",
        [defCol("a", "INTEGER", [1], [2]), defCol("b", "INTEGER", [2])],
        {2},
    )

    combined = cc.concat([first, second], "rel")
    totals = cc.aggregate(combined, "agg", ["a"], "b", "sum", "total_b")
    cc.collect(totals, 1)

    return {first, second}
def protocol():
    """Two-party concat + aggregate with a trailing projection before collect.

    "left" (party 1) has columns "column_a"/"column_b". "right" (party 2)
    declares "column_a" with list arguments [1] and [2], and "column_c" with
    [1]. The concatenation is aggregated into "actual" ("sum" over
    "column_b", grouped by "column_a", output "total_b"), projected onto
    ["column_a", "total_b"] as "actual_open", and passed to ``cc.collect``
    with argument 1.

    :return: set containing the two input relations
    """
    left_schema = [
        defCol("column_a", "INTEGER", [1]),
        defCol("column_b", "INTEGER", [1]),
    ]
    left_root = cc.create("left", left_schema, {1})

    right_schema = [
        defCol("column_a", "INTEGER", [1], [2]),
        defCol("column_c", "INTEGER", [1]),
    ]
    right_root = cc.create("right", right_schema, {2})

    combined = cc.concat([left_root, right_root], "rel")
    totals = cc.aggregate(combined, "actual", ["column_a"], "column_b", "sum", "total_b")
    revealed = cc.project(totals, "actual_open", ["column_a", "total_b"])
    cc.collect(revealed, 1)

    return {left_root, right_root}
def protocol():
    """Three-party workflow using explicit ``sal._close`` / ``sal._open`` calls.

    Each party's relation ("in1".."in3") is passed to ``sal._close`` together
    with the set {1, 2, 3}. The closed relations are concatenated, aggregated
    ('+' over "b", grouped by "a", output "total") and passed to
    ``sal._open`` with name "opened" and argument 1.

    :return: set containing the three input relations
    """
    # define inputs
    specs = (
        ("in1", ("a", "b"), 1),
        ("in2", ("c", "d"), 2),
        ("in3", ("e", "f"), 3),
    )
    roots = []
    for rel_name, col_names, party in specs:
        schema = [defCol(col_name, "INTEGER", [party]) for col_name in col_names]
        roots.append(sal.create(rel_name, schema, {party}))

    # close every input only after all inputs exist; each call gets its own set
    closed = [
        sal._close(root, label, {1, 2, 3})
        for root, label in zip(roots, ("cl1", "cl2", "cl3"))
    ]

    combined = sal.concat(closed, "rel")
    totals = sal.aggregate(combined, "agg", ["a"], "b", "+", "total")
    sal._open(totals, "opened", 1)

    # return root nodes
    return set(roots)
def protocol():
    """Closed join of "govreg" with two concatenated company relations.

    "govreg" (party 1, columns "a"/"b") is closed and projected. "company0"
    (party 2) and "company1" (party 3), both with columns "c"/"d", are
    closed and concatenated into "right_rel". The join on "a" == "c" is
    aggregated ('+' over "d", grouped by "b", output "total") and passed to
    ``sal._open`` with name "opened" and argument 1.

    :return: set containing the three input relations
    """
    # define inputs
    def make_input(rel_name, col_names, party):
        schema = [defCol(col_name, "INTEGER", [party]) for col_name in col_names]
        return sal.create(rel_name, schema, {party})

    govreg = make_input("govreg", ("a", "b"), 1)
    company0 = make_input("company0", ("c", "d"), 2)
    company1 = make_input("company1", ("c", "d"), 3)

    govreg_closed = sal._close(govreg, "cl1", {1, 2, 3})
    left_side = sal.project(govreg_closed, "projA", ["a", "b"])

    company0_closed = sal._close(company0, "cl2", {1, 2, 3})
    company1_closed = sal._close(company1, "cl3", {1, 2, 3})
    right_side = sal.concat([company0_closed, company1_closed], "right_rel")

    # NOTE(review): "projB" is created but the join below consumes right_rel
    # directly, leaving this projection without a consumer in this function.
    # Confirm whether the join was meant to use the projection instead.
    sal.project(right_side, "projB", ["c", "d"])

    matched = sal.join(left_side, right_side, "joined", ["a"], ["c"])
    totals = sal.aggregate(matched, "agg", ["b"], "d", "+", "total")
    sal._open(totals, "opened", 1)

    return {govreg, company0, company1}
def protocol():
    """Two-party concat followed by a '+' aggregation and a collect.

    Parties 1 and 2 each supply INTEGER columns "a" and "b". The
    concatenation "rel" is aggregated ('+' over "b", grouped by "a", output
    "total_b") and passed to ``sal.collect`` with argument 1.

    :return: set containing the two input relations
    """
    # define inputs
    roots = []
    for party in (1, 2):
        schema = [defCol(col_name, "INTEGER", [party]) for col_name in ("a", "b")]
        roots.append(sal.create("in_{}".format(party), schema, {party}))

    # combine parties' inputs into one relation
    combined = sal.concat(list(roots), "rel")

    # specify the workflow
    totals = sal.aggregate(combined, "agg", ["a"], "b", "+", "total_b")
    sal.collect(totals, 1)

    # return root nodes
    return set(roots)
def protocol():
    """Two-section pricing workflow over store sales and a UPC/brand table.

    Section 1 joins the sales relation with the UPC/brand relation on 'upc'
    and derives per (store, brand, week) quantities and a weighted price.
    Section 2 repeats a similar pattern keyed on
    (store_zip3, retailer_code, brand_code_bu, week_end). The final
    projection is passed to ``sal.collect`` with argument 1.

    :return: set containing the two input relations
    """
    def store_brand_week():
        # fresh list per call, as the original passed a new literal every time
        return ['store_code_uc', 'brand_code_bu', 'week_end']

    def zip_retailer_brand_week():
        # fresh list per call, as the original passed a new literal every time
        return ['store_zip3', 'retailer_code', 'brand_code_bu', 'week_end']

    sales_spec = (
        ("store_code_uc", "STRING"),
        ('upc', 'STRING'),
        ('week_end', 'STRING'),
        ('q', 'INTEGER'),
        ('avg_unit_p', 'FLOAT'),
        ('retailer_code', 'STRING'),
        ('store_zip3', 'STRING'),
    )
    brand_spec = (
        ('brand_code_bu', 'STRING'),
        ('brand_descr_bu', 'STRING'),
        ('upc', 'STRING'),
        ('size1_amount', 'FLOAT'),
    )
    sales_cols = [defCol(col_name, col_type, [1]) for col_name, col_type in sales_spec]
    brand_cols = [defCol(col_name, col_type, [2]) for col_name, col_type in brand_spec]

    # concatenated DFs from local_workflow.py
    sales = sal.create('concatenated_DFs', sales_cols, {1})
    # the output of preprocess_products.py
    # NOTE(review): these columns are declared with [2] but the relation is
    # created with the set {1} -- confirm the mismatch is intended.
    brands = sal.create('temp_UPC_brandBU_crspnd', brand_cols, {1})

    # SECTION 1
    # Compute the quantity weighted average price per unit & total quantity
    # sold at the store-brand level
    with_upc = sal.join(sales, brands, 'w_upc', ['upc'], ['upc'])
    with_oz_price = sal.divide(
        with_upc, 'w_avg_OZ_p', 'avg_OZ_p', ['avg_unit_p', 'size1_amount'])
    with_oz_qty = sal.multiply(
        with_oz_price, 'w_q_upd', 'q', ['q', 'size1_amount'])
    brand_qty = sal.aggregate(
        with_oz_qty, 'brand_OZq_sum', store_brand_week(), 'q', '+', 'brand_OZq')
    with_brand_qty = sal.join(
        with_oz_qty, brand_qty, 'total_brnd_OZq',
        store_brand_week(), store_brand_week())
    weighted = sal.multiply(
        with_brand_qty, 'w_wghtd_OZ_brnd_p', 'wghtd_OZ_brnd_p',
        ['q', 'avg_OZ_p'])
    weighted_final = sal.divide(
        weighted, 'w_wghtd_OZ_brnd_p_final', 'wghtd_OZ_brnd_p',
        ['wghtd_OZ_brnd_p', 'brand_OZq'])
    brand_price = sal.aggregate(
        weighted_final, 'brnd_p_sum', store_brand_week(),
        'wghtd_OZ_brnd_p', '+', 'avg_OZ_brnd_p')
    section_one_joined = sal.join(
        brand_price, weighted_final, 'result',
        store_brand_week(), store_brand_week())
    section_one = sal.project(section_one_joined, 'section_one_result', [
        "avg_OZ_brnd_p", "week_end", "store_code_uc", "brand_code_bu",
        "brand_descr_bu", "brand_OZq", 'retailer_code', 'store_zip3', 'q'
    ])

    # SECTION 2
    # Compute the average price per OZ & total OZs sold for each brand at the
    # retailer-$geo_unit level, by compiling the store level data that
    # comprises each retailer-$geo_unit. Compute the total quantity sold by
    # each retailer-$geo_unit
    geo_qty = sal.aggregate(
        section_one, 'temp_sum', zip_retailer_brand_week(),
        'brand_OZq', '+', 'brand_OZq')
    with_geo_qty = sal.join(
        section_one, geo_qty, 'result_brnd_sum',
        zip_retailer_brand_week(), zip_retailer_brand_week())
    geo_weighted = sal.multiply(
        with_geo_qty, 'wghtd_p_mult', 'wghtd_p',
        ['brand_OZq', 'avg_OZ_brnd_p'])
    geo_weighted_final = sal.divide(
        geo_weighted, 'wghtd_p_final', 'wghtd_p', ['wghtd_p', 'q'])
    geo_price = sal.aggregate(
        geo_weighted_final, 'wghtd_p_sum', zip_retailer_brand_week(),
        'wghtd_p', '+', 'p')
    section_two_joined = sal.join(
        geo_weighted_final, geo_price, 'sec_4_result',
        zip_retailer_brand_week(), zip_retailer_brand_week())

    # TODO: filter out sec_4_result rows where 'store_zip3' cell is empty
    final = sal.project(section_two_joined, 'final', [
        'store_zip3', 'retailer_code', 'week_end', 'brand_code_bu',
        'brand_descr_bu', 'q', 'p'
    ])
    sal.collect(final, 1)

    return {sales, brands}