def test_replicated_df_apply_function_sqrt_returns_replicated_df(self):
    """apply(np.sqrt) on a replicated frame matches plain pandas and
    keeps the 'replicated' distribution."""
    replicated = ParallelDataFrame(self.pd_df1, dist='replicated')
    expected = self.pd_df1.apply(np.sqrt)
    actual = replicated.apply(np.sqrt)
    self.assertTrue(actual.equals(expected))
    self.assertEqual(actual.dist, 'replicated')
def test_replicated_df_apply_function_list_like_result_returns_replicated_series(
        self):
    """A row-wise apply that yields list-like values must return a
    replicated ParallelSeries equal to the pandas result."""
    df = ParallelDataFrame(self.pd_df1, dist='replicated')
    pd_result = self.pd_df1.apply(lambda x: [1, 2], axis=1)
    result = df.apply(lambda x: [1, 2], axis=1)
    # assertIsInstance reports the actual type on failure, unlike the
    # opaque assertTrue(isinstance(...)) it replaces.
    self.assertIsInstance(result, ParallelSeries)
    self.assertEqual(result.dist, 'replicated')
    self.assertTrue(result.equals(pd_result))
def test_replicated_df_apply_function_sum_axis1_returns_replicated_series(
        self):
    """apply(np.sum, axis=1) on a replicated frame must return a
    replicated ParallelSeries equal to the pandas result."""
    df1 = ParallelDataFrame(self.pd_df1, dist='replicated')
    pd_result = self.pd_df1.apply(np.sum, axis=1)
    result = df1.apply(np.sum, axis=1)
    # assertIsInstance gives a descriptive failure message, unlike
    # assertTrue(isinstance(...)).
    self.assertIsInstance(result, ParallelSeries)
    self.assertEqual(result.dist, 'replicated')
    self.assertTrue(result.equals(pd_result))
def test_column_distributed_df_apply_function_sqrt_returns_distributed_df(
        self):
    """apply(np.sqrt) on a column-distributed frame returns a distributed
    ParallelDataFrame; squaring that result round-trips back to the input."""
    df1 = ParallelDataFrame(self.pd_df1, dist_data=False)
    result = df1.apply(np.sqrt)
    # sqrt followed by square should reproduce the original frame.
    roundtrip = result.apply(np.square)
    # assertIsInstance reports the actual type on failure, unlike
    # assertTrue(isinstance(...)).
    self.assertIsInstance(result, ParallelDataFrame)
    self.assertEqual(result.dist, 'distributed')
    self.assertFalse(result.equals(df1))
    self.assertTrue(df1.equals(roundtrip))
def test_column_distributed_df_apply_function_sum_returns_distributed_series_raw_False(
        self):
    """apply(np.sum, axis=0, raw=False) on a distributed frame returns a
    distributed ParallelSeries whose collected, sorted values match pandas."""
    df1 = ParallelDataFrame(self.pd_df1, dist_data=False)
    pd_result = self.pd_df1.apply(np.sum, axis=0, raw=False)
    result = df1.apply(np.sum, axis=0, raw=False)
    # assertIsInstance reports the actual type on failure, unlike
    # assertTrue(isinstance(...)).
    self.assertIsInstance(result, ParallelSeries)
    self.assertEqual(result.dist, 'distributed')
    # set(iterable) directly; wrapping in list() first is redundant.
    self.assertEqual(set(result.globalIndex), set(pd_result.index))
    # Distributed index order is not deterministic, so compare sorted.
    self.assertTrue(result.collect().sort_index().equals(
        pd_result.sort_index()))
def main():
    """Run the MPIpandas micro-benchmarks (value_counts, apply, from_dict,
    corr) and append per-function wall times, in milliseconds, to a stats
    file in CSV form: "#Processors, Function, Time(ms)".

    Must be launched under MPI (e.g. mpiexec); only rank 0 writes the
    stats file. The input data files are read from paths relative to the
    working directory.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--stats_file',
        help="Name and location of file where the stats should be saved")
    args = parser.parse_args()
    if args.stats_file:
        file_stats = args.stats_file
    else:
        file_stats = "./microbenchmark_stats.txt"

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    number_processors = comm.Get_size()

    data_folder1 = './MPIpandas/examples/microbenchmarks/test_data1_microbenchmarks'
    data_folder2 = './MPIpandas/examples/microbenchmarks/test_data2_microbenchmarks'

    # If the stats file does not exist yet, write the CSV header once
    # (rank 0 only, to avoid concurrent writers).
    if rank == 0 and not os.path.isfile(file_stats):
        with open(file_stats, 'a') as file:
            file.write("#Processors, Function, Time(ms)\n")

    # ------------------------------------------------------------------
    # value_counts benchmark
    # ------------------------------------------------------------------
    pd_data = pd.read_csv(data_folder1 + "/adult_vc_transpose_40K.data",
                          low_memory=False)
    #pd_data = pd.read_csv(data_folder2+"/adult_vc_transpose_80K.data", low_memory = False)
    pd_data.index = ['workclass', 'education', 'educationNum']
    dist_data = ParallelDataFrame(pd_data, dist_data=False)
    # Normalize string cells: lower-case, strip '-' and spaces.
    dist_data = dist_data.apply(lambda x: x.str.lower()
                                if x.dtype == 'object' else x)
    dist_data = dist_data.apply(lambda x: x.str.replace('-', '')
                                if x.dtype == 'object' else x)
    dist_data = dist_data.apply(lambda x: x.str.replace(' ', '')
                                if x.dtype == 'object' else x)
    dist_series = dist_data.loc['education']

    comm.Barrier()  # synchronize all ranks before starting the timer
    t0 = time.time()
    dist_series.value_counts()
    comm.Barrier()  # wait for every rank before stopping the timer
    if rank == 0:
        with open(file_stats, 'a') as file:
            file.write('{}, value_counts, {}\n'.format(
                number_processors, (time.time() - t0) * 1000))

    # ------------------------------------------------------------------
    # apply benchmark
    # ------------------------------------------------------------------
    pd_data = pd.read_csv(data_folder1 + "/adult_80K.data")
    #pd_data = pd.read_csv(data_folder2+"/adult_163K.data")
    pd_data.columns = [
        'age', 'workclass', 'fnlwgt', 'education', 'educationNum',
        'maritalStatus', 'occupation', 'relationship', 'race', 'sex',
        'capitalGain', 'capitalLoss', 'hoursPerWeek', 'nativeCountry',
        'payPerYear'
    ]
    dist_data = ParallelDataFrame(pd_data, dist_data=False)

    comm.Barrier()
    t0 = time.time()
    new_dist_data = dist_data.apply(lambda x: x.str.lower()
                                    if x.dtype == 'object' else x)
    comm.Barrier()
    if rank == 0:
        with open(file_stats, 'a') as file:
            file.write('{}, apply, {}\n'.format(number_processors,
                                                (time.time() - t0) * 1000))

    # ------------------------------------------------------------------
    # from_dict benchmark
    # ------------------------------------------------------------------
    pd_data = pd.read_csv(data_folder1 + "/adult_80K.data")
    #pd_data = pd.read_csv(data_folder2+"/adult_163K.data")
    pd_data.columns = [
        'age', 'workclass', 'fnlwgt', 'education', 'educationNum',
        'maritalStatus', 'occupation', 'relationship', 'race', 'sex',
        'capitalGain', 'capitalLoss', 'hoursPerWeek', 'nativeCountry',
        'payPerYear'
    ]
    # Named 'records' rather than 'dict' to avoid shadowing the builtin.
    records = pd_data.to_dict()

    comm.Barrier()
    t0 = time.time()
    dist_data = ParallelDataFrame.from_dict(records, orient='index')
    comm.Barrier()
    if rank == 0:
        with open(file_stats, 'a') as file:
            file.write('{}, from_dict, {}\n'.format(number_processors,
                                                    (time.time() - t0) * 1000))

    # ------------------------------------------------------------------
    # corr benchmark
    # ------------------------------------------------------------------
    pd_data = pd.read_csv(data_folder1 + "/adult_discretized_82K.data")
    #pd_data = pd.read_csv(data_folder2+"/adult_discretized_164K.data")
    dist_data = ParallelDataFrame(pd_data)

    comm.Barrier()
    t0 = time.time()
    dist_data.corr()
    comm.Barrier()
    if rank == 0:
        with open(file_stats, 'a') as file:
            file.write('{}, corr, {}\n'.format(number_processors,
                                               (time.time() - t0) * 1000))