Exemplo n.º 1
0
    def test_replicated_df_apply_function_sqrt_returns_replicated_df(self):
        """Apply ``np.sqrt`` to a replicated frame and compare with pandas.

        The result must equal what plain pandas produces for the same
        input and must still report a 'replicated' distribution.
        """
        replicated = ParallelDataFrame(self.pd_df1, dist='replicated')

        expected = self.pd_df1.apply(np.sqrt)
        actual = replicated.apply(np.sqrt)

        matches_pandas = actual.equals(expected)
        self.assertTrue(matches_pandas)
        self.assertEqual(actual.dist, 'replicated')
Exemplo n.º 2
0
    def test_replicated_df_apply_function_list_like_result_returns_replicated_series(
            self):
        """Row-wise apply returning a list yields a replicated series.

        The applied callable ignores its input and always returns
        ``[1, 2]``; the outcome is checked for type, distribution and
        equality with the pandas result.
        """

        def constant_pair(_row):
            # Same list-like value for every row, regardless of content.
            return [1, 2]

        replicated = ParallelDataFrame(self.pd_df1, dist='replicated')

        expected = self.pd_df1.apply(constant_pair, axis=1)
        actual = replicated.apply(constant_pair, axis=1)

        self.assertIsInstance(actual, ParallelSeries)
        self.assertEqual(actual.dist, 'replicated')
        matches_pandas = actual.equals(expected)
        self.assertTrue(matches_pandas)
Exemplo n.º 3
0
    def test_replicated_df_apply_function_sum_axis1_returns_replicated_series(
            self):
        """``np.sum`` along axis 1 of a replicated frame gives a replicated series.

        The result is checked for type, for a 'replicated' distribution
        and for equality with the pandas result on the same data.
        """
        replicated = ParallelDataFrame(self.pd_df1, dist='replicated')

        expected = self.pd_df1.apply(np.sum, axis=1)
        actual = replicated.apply(np.sum, axis=1)

        self.assertIsInstance(actual, ParallelSeries)
        self.assertEqual(actual.dist, 'replicated')
        matches_pandas = actual.equals(expected)
        self.assertTrue(matches_pandas)
Exemplo n.º 4
0
    def test_column_distributed_df_apply_function_sqrt_returns_distributed_df(
            self):
        """``np.sqrt`` on a frame built with ``dist_data=False`` stays distributed.

        Squaring the square-rooted frame must give back the original,
        while the square-rooted frame itself must differ from it.
        """
        original = ParallelDataFrame(self.pd_df1, dist_data=False)
        rooted = original.apply(np.sqrt)
        round_trip = rooted.apply(np.square)

        self.assertIsInstance(rooted, ParallelDataFrame)
        self.assertEqual(rooted.dist, 'distributed')
        # sqrt must actually change the values ...
        unchanged = rooted.equals(original)
        self.assertFalse(unchanged)
        # ... and square must undo it.
        restored = original.equals(round_trip)
        self.assertTrue(restored)
Exemplo n.º 5
0
    def test_column_distributed_df_apply_function_sum_returns_distributed_series_raw_False(
            self):
        """``np.sum`` over axis 0 with ``raw=False`` gives a distributed series.

        The global index of the result must contain the same labels as the
        pandas result, and the collected, index-sorted values must equal
        the index-sorted pandas result.
        """
        distributed = ParallelDataFrame(self.pd_df1, dist_data=False)

        expected = self.pd_df1.apply(np.sum, axis=0, raw=False)
        actual = distributed.apply(np.sum, axis=0, raw=False)

        self.assertIsInstance(actual, ParallelSeries)
        self.assertEqual(actual.dist, 'distributed')

        # Label order is not compared, only the set of labels.
        actual_labels = set(actual.globalIndex)
        expected_labels = set(expected.index)
        self.assertEqual(actual_labels, expected_labels)

        # Sort both sides by index so that ordering cannot cause a mismatch.
        gathered_sorted = actual.collect().sort_index()
        expected_sorted = expected.sort_index()
        self.assertTrue(gathered_sorted.equals(expected_sorted))
Exemplo n.º 6
0
def main():
    """Run the micro-benchmarks and append their timings to a stats file.

    Command line:
        --stats_file  Path of the file the timings are appended to.
                      Falls back to ``./microbenchmark_stats.txt``.

    Four operations are timed, each between two MPI barriers:
    ``value_counts``, ``apply``, ``from_dict`` and ``corr``. Every rank
    takes part in the barriers and the computation; only rank 0 touches
    the stats file. Each benchmark appends one line of the form
    ``<processors>, <function>, <milliseconds>``; a header line is
    written first when the file does not exist yet.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--stats_file',
        help="Name and location of file where the stats should be saved")
    args = parser.parse_args()
    # `or` keeps the original fallback for both a missing and an empty value.
    file_stats = args.stats_file or "./microbenchmark_stats.txt"

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    number_processors = comm.Get_size()

    data_folder1 = './MPIpandas/examples/microbenchmarks/test_data1_microbenchmarks'
    # Second data folder with larger inputs; only referenced by the
    # alternative read_csv calls noted in the comments below.
    data_folder2 = './MPIpandas/examples/microbenchmarks/test_data2_microbenchmarks'

    adult_columns = [
        'age', 'workclass', 'fnlwgt', 'education', 'educationNum',
        'maritalStatus', 'occupation', 'relationship', 'race', 'sex',
        'capitalGain', 'capitalLoss', 'hoursPerWeek', 'nativeCountry',
        'payPerYear'
    ]

    def start_timer():
        """Synchronise all ranks, then return the wall-clock start time."""
        comm.Barrier()
        return time.time()

    def record_elapsed(label, t0):
        """Synchronise all ranks and let rank 0 append the elapsed time."""
        comm.Barrier()
        # Take the reading before opening the file so that file I/O is not
        # counted as part of the benchmarked operation.
        elapsed_ms = (time.time() - t0) * 1000
        if rank == 0:
            with open(file_stats, 'a') as stats:
                stats.write('{}, {}, {}\n'.format(number_processors, label,
                                                  elapsed_ms))

    # If the stats file does not exist yet, start it with a header line.
    if rank == 0 and not os.path.isfile(file_stats):
        with open(file_stats, 'a') as stats:
            stats.write("#Processors, Function, Time(ms)\n")

    # ------------------------------------------------------------------
    # value_counts
    # ------------------------------------------------------------------
    # Larger alternative: data_folder2 + "/adult_vc_transpose_80K.data"
    pd_data = pd.read_csv(data_folder1 + "/adult_vc_transpose_40K.data",
                          low_memory=False)
    pd_data.index = ['workclass', 'education', 'educationNum']
    dist_data = ParallelDataFrame(pd_data, dist_data=False)

    # Normalise object (string) columns: lower-case, strip '-' and ' '.
    dist_data = dist_data.apply(lambda x: x.str.lower()
                                if x.dtype == 'object' else x)
    dist_data = dist_data.apply(lambda x: x.str.replace('-', '')
                                if x.dtype == 'object' else x)
    dist_data = dist_data.apply(lambda x: x.str.replace(' ', '')
                                if x.dtype == 'object' else x)
    dist_series = dist_data.loc['education']

    t0 = start_timer()
    dist_series.value_counts()
    record_elapsed('value_counts', t0)

    # ------------------------------------------------------------------
    # apply
    # ------------------------------------------------------------------
    # Larger alternative: data_folder2 + "/adult_163K.data"
    pd_data = pd.read_csv(data_folder1 + "/adult_80K.data")
    pd_data.columns = list(adult_columns)
    dist_data = ParallelDataFrame(pd_data, dist_data=False)

    t0 = start_timer()
    # The result is bound to a name so it stays alive until after the
    # timing has been recorded, as in the original benchmark.
    new_dist_data = dist_data.apply(lambda x: x.str.lower()
                                    if x.dtype == 'object' else x)
    record_elapsed('apply', t0)

    # ------------------------------------------------------------------
    # from_dict
    # ------------------------------------------------------------------
    # Larger alternative: data_folder2 + "/adult_163K.data"
    pd_data = pd.read_csv(data_folder1 + "/adult_80K.data")
    pd_data.columns = list(adult_columns)
    # Named `data_dict` rather than `dict` to avoid shadowing the builtin.
    data_dict = pd_data.to_dict()

    t0 = start_timer()
    dist_data = ParallelDataFrame.from_dict(data_dict, orient='index')
    record_elapsed('from_dict', t0)

    # ------------------------------------------------------------------
    # corr
    # ------------------------------------------------------------------
    # Larger alternative: data_folder2 + "/adult_discretized_164K.data"
    pd_data = pd.read_csv(data_folder1 + "/adult_discretized_82K.data")
    dist_data = ParallelDataFrame(pd_data)

    t0 = start_timer()
    dist_data.corr()
    record_elapsed('corr', t0)