def time_merge_2intkey(self):
    # Seed the pipeline with the first right-table chunk, then thread the
    # returned state through the merges of the remaining chunks.
    result, orizer, intrizer, leftsorter, leftcount = pipeline_merge(
        self.left, self.right[2], slices=10, how="pipeline_merge")
    for i in range(3, self.pieces):
        result, orizer, intrizer, leftsorter, leftcount = pipeline_merge(
            self.left, self.right[i],
            factorizer=orizer, intfactorizer=intrizer,
            leftsorter=leftsorter, leftcount=leftcount,
            slices=10, how="pipeline_merge")
def time_merge_2intkey(self):
    result = pipeline_merge(self.left, self.right, how="pipeline")
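# Taken together, the two benchmarks above contrast the pipelined path with
# the one-shot baseline: the pipelined variant threads the state returned by
# pipeline_merge (the factorizers plus leftsorter/leftcount) through every
# call, so the left table is hashed only once. A minimal sketch of that
# pattern, assuming a pandas build that exposes pipeline_merge as in the
# snippets above; this is an illustration, not code from the source.
def pipelined_join(left, right_chunks, slices=10):
    # All state starts as None; pipeline_merge populates it on the first
    # call, and each later call reuses it instead of re-hashing `left`.
    orizer = intrizer = leftsorter = leftcount = None
    results = []
    for chunk in right_chunks:
        result, orizer, intrizer, leftsorter, leftcount = pipeline_merge(
            left, chunk,
            factorizer=orizer, intfactorizer=intrizer,
            leftsorter=leftsorter, leftcount=leftcount,
            slices=slices, how="pipeline")
        results.append(result)
    return results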
def run_command(key):
    pywren.wrenlogging.default_config('INFO')
    logging.basicConfig(level=logging.DEBUG)
    logger = logging.getLogger(__name__)
    logger.info("before everything")

    partition_num = key['partition_num']
    rounds = key['rounds']
    em = JiffyClient(host=key['em'])
    reduceId = key['taskId']
    appName = key['appName']
    alg_type = key['type']
    data_ques1 = open_or_create_jiffy_queues(em, appName, partition_num, 1,
                                             'receiver')
    logger.info("queue opened")
    names = key['names']
    dtypes = key['dtypes']

    ############# left table
    left_table = 100000
    indices = tm.makeStringIndex(left_table).values
    # Note: this rebinding shadows the `key` argument dict above.
    key = np.tile(indices[:left_table], 1)
    left = DataFrame({"key": key, "value": np.random.randn(left_table)})
    t_start = time.time()

    ############## keep fetching right-table chunks until all partitions finish
    lim = 0
    fin_num = 0
    if alg_type == 'pipelined':
        leftsorter = None
        leftcount = None
        orizer = None
        intrizer = None
        count = 0
        while fin_num < partition_num and lim < 15:
            #### read table
            lim += 1
            time.sleep(0.01)
            logger.info("before get")
            obj = data_ques1[0].get()
            if sys.getsizeof(obj) > 1000:
                part_data = pd.read_table(BytesIO(obj), header=None,
                                          delimiter="|",
                                          names=['key', 'value2'])
            ds, fin_num = read_jiffy_splits(names, dtypes, reduceId,
                                            data_ques1, fin_num,
                                            batch_size=1,
                                            fin_size=partition_num)
            logger.info(ds)
            logger.info(fin_num)
            if len(ds) > 0:
                ### join the newly arrived chunk, threading the pipeline
                ### state (factorizers, leftsorter, leftcount) through
                result, orizer, intrizer, leftsorter, leftcount = pipeline_merge(
                    left, ds,
                    factorizer=orizer, intfactorizer=intrizer,
                    leftsorter=leftsorter, leftcount=leftcount,
                    slices=8, how="pipeline")
                time.sleep(0.8)
                logger.info("merged")
    elif alg_type == 'origin':
        # Baseline: accumulate all chunks, then do a single inner merge.
        ds = pd.DataFrame()
        while fin_num < partition_num and lim < 1500:
            lim += 1
            #### read table
            dd, fin_num = read_jiffy_splits(names, dtypes, reduceId,
                                            data_ques1, fin_num,
                                            batch_size=1,
                                            fin_size=partition_num)
            if len(dd) > 0:
                ds = ds.append(dd)
        print("this is ds:")
        print(ds)
        result = merge(left, ds, how="inner")

    print(fin_num)
    t_fin = time.time()
    return [t_fin, t_start]
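# run_command is written as a PyWren entry point: each reducer receives its
# parameters in the `key` dict and pulls right-table chunks from Jiffy
# queues. A possible driver, sketched with the standard PyWren executor API;
# the field values below are illustrative assumptions, not from the source.
import pywren

def launch(partition_num, em_host, app_name, names, dtypes):
    # One parameter dict per reducer; the fields mirror what run_command reads.
    keys = [{
        'partition_num': partition_num,
        'rounds': 1,                # illustrative value
        'em': em_host,              # Jiffy server address
        'taskId': i,
        'appName': app_name,
        'type': 'pipelined',        # or 'origin' for the baseline merge
        'names': names,
        'dtypes': dtypes,
    } for i in range(partition_num)]
    wrenexec = pywren.default_executor()
    futures = wrenexec.map(run_command, keys)
    # Each future returns [t_fin, t_start] as produced by run_command.
    return [f.result() for f in futures]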
left = pd.DataFrame({
    'key': ['K5', 'K7', 'K5', 'K50'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3']
})
right = pd.DataFrame({
    'key': ['K9', 'K23', 'K5', 'K7'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3']
})
right2 = pd.DataFrame({
    'key': ['K0', 'K6', 'K7', 'K3'],
    'C': ['C9', 'C1', 'C2', 'C8'],
    'D': ['D9', 'D1', 'D2', 'D8']
})

# First chunk: seed the pipeline state.
result, objectrizer, intrizer, leftsorter, leftcount = pd.pipeline_merge(
    left, right, how='pipeline')
print(result)

# Second chunk: reuse the state built on the first call.
result, objectrizer, intrizer, leftsorter, leftcount = pd.pipeline_merge(
    left, right2,
    factorizer=objectrizer, intfactorizer=intrizer,
    leftsorter=leftsorter, leftcount=leftcount, how='pipeline')
print(result)
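# The two calls above show the intended usage: the first pipeline_merge
# hashes left['key'] and returns the factorizer and sorter state; the second
# call passes that state back in, so only the new chunk (right2) has to be
# hashed and probed. Keys that match nothing (for example 'K50' and 'K23')
# simply drop out of the result, consistent with the inner merge that the
# 'origin' baseline computes.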
except ImportError:
    from pandas import ordered_merge as merge_ordered

N = 10000000
pieces = 10
indices = tm.makeStringIndex(N).values
indices2 = tm.makeStringIndex(N).values
key = np.tile(indices[:500000], 1)
key2 = np.tile(indices2[:500000], 1)
left = DataFrame({"key": key, "value": np.random.randn(500000)})
right = DataFrame({
    "key": indices[1 * 100000 + 50000:9 * 100000 + 50000],
    "value2": np.random.randn(800000),
})
result = pipeline_merge(left, right, how="pipeline")

# def time_merge_dataframe_integer_2key(self, sort):
#     pipeline_merge(self.df, self.df3, how="pipeline")
#
# def time_merge_dataframe_integer_key(self, sort):
#     pipeline_merge(self.df, self.df2, on="key1", how="pipeline")

# class I8Merge:
#
#     params = ["inner", "outer", "left", "right"]
#     param_names = ["how"]
#
#     def setup(self, how):
#         low, high, n = -1000, 1000, 10 ** 6
#         self.left = DataFrame(
print("\n") logger.info("Start Running test for pipelined pandas code") leftsorter = None leftcount = None orizer = None intrizer = None count = 0 for i in range(1, pieces): start = timeit.default_timer() result, orizer, intrizer, leftsorter, leftcount = pipeline_merge( left, right[i], factorizer=orizer, intfactorizer=intrizer, leftsorter=leftsorter, leftcount=leftcount, slices=ttt - 1, how="pipeline") end = timeit.default_timer() count += (end - start) logger.info( str(i) + " chunks take time " + str(end - start) + " Accum time: " + str(count)) #print("******* ", end - start) print( "--------------------------------------------------------------------------------" ) print("\n")
right = pd.DataFrame({
    'key': ['K0', 'K1', 'K2', 'K3', 'K5', 'K7'],
    'C': ['C0', 'C1', 'C2', 'C3', 'C4', 'C5'],
    'D': ['D0', 'D1', 'D2', 'D3', 'D4', 'D5']
})
right_2 = pd.DataFrame({
    'key': ['K5', 'K4', 'K3', 'K5', 'K6', 'K9'],
    'C': ['CC0', 'CC1', 'CC2', 'CC3', 'CC4', 'CC5'],
    'D': ['DD0', 'DD1', 'DD2', 'DD3', 'DD4', 'DD5']
})

left_full = pd.concat([left, left_2])
right_full = pd.concat([right, right_2])

# Reference result: one-shot merge of the fully concatenated tables.
result, orizer, intrizer, sorter, count = pd.pipeline_merge(
    left_full, right_full, slices=1, how="pipeline")
print(result)

# Build the left-side hash table incrementally: first over left['key'],
# then extended with left_2['key'].
keys, fac, intfac = pd.build_hash_table(left['key'])
keys, fac, intfac = pd.build_hash_table(left_2['key'], factorizer=fac,
                                        intfactorizer=intfac,
                                        previous_keys=keys)
print(left_full)
print(keys)

result, fac, intfac, leftsorter, leftcount = pd.pipeline_merge(
    left_full, right, factorizer=fac,
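# Judging from the call pattern, build_hash_table factors the left-side
# hashing step out of the merge itself: the table over left['key'] is
# extended with left_2['key'] via the factorizer/intfactorizer handles and
# previous_keys, and the resulting state is then handed to pd.pipeline_merge
# exactly as a previous merge's returned state would be. This decouples
# building the hash table from probing it, which is what lets the left side
# be ingested incrementally as well.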