Example #1
    def test_corr_with_replicated_dataframe(self):
        pd_df = pd.DataFrame([(.2, .3, .4), (.0, .6, .9), (.6, .0, .6),
                              (.2, .1, .1)],
                             columns=['dogs', 'cats', 'rats'])
        rep_df = ParallelDataFrame(pd_df, dist='replicated')

        rep_corr = rep_df.corr()
        pd_corr = pd_df.corr()

        self.assertTrue(rep_corr.equals(pd_corr))
Example #2
    def test_corr_with_col_distributed_dataframe(self):
        pd_df = pd.DataFrame([(.2, .3, .4), (.0, .6, .9), (.6, .0, .6),
                              (.2, .1, .1)],
                             columns=['dogs', 'cats', 'rats'])
        dist_df = ParallelDataFrame(pd_df, dist_data=False)

        dist_corr = dist_df.corr()
        pd_corr = pd_df.corr()

        # compare the values of each row (rounded to 6 digits)
        for row in dist_corr.globalIndex:
            self.assertEqual(
                list(dist_corr.loc[row].collect().sort_index().round(6)),
                list(pd_corr.loc[row].sort_index().round(6)))
Example #3
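Only main() is shown here; below is a minimal sketch of the module-level imports the script (and the test snippets above) rely on. The ParallelDataFrame import path is an assumption and may differ in the actual MPIpandas package:

import argparse
import os
import time

import pandas as pd
from mpi4py import MPI

# Assumption: ParallelDataFrame is importable from the MPIpandas package's top level;
# adjust this to the package's actual module layout.
from mpipandas import ParallelDataFrame
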
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--stats_file',
        help="Name and location of file where the stats should be saved")
    args = parser.parse_args()
    if args.stats_file:
        file_stats = args.stats_file
    else:
        file_stats = "./microbenchmark_stats.txt"

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    number_processors = comm.Get_size()

    data_folder1 = './MPIpandas/examples/microbenchmarks/test_data1_microbenchmarks'
    data_folder2 = './MPIpandas/examples/microbenchmarks/test_data2_microbenchmarks'

    # if the stats file does not exist, add a header
    if rank == 0 and not os.path.isfile(file_stats):
        with open(file_stats, 'a') as file:
            file.write("#Processors, Function, Time(ms)\n")

    #---------------------------------------------------------------------------------------
    #value_counts function------------------------------------------------------------------
    #---------------------------------------------------------------------------------------
    pd_data = pd.read_csv(data_folder1 + "/adult_vc_transpose_40K.data",
                          low_memory=False)
    #pd_data = pd.read_csv(data_folder2+"/adult_vc_transpose_80K.data", low_memory = False)
    pd_data.index = ['workclass', 'education', 'educationNum']
    dist_data = ParallelDataFrame(pd_data, dist_data=False)

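    # normalize the string columns: lower-case them and strip '-' and ' ' characters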
    dist_data = dist_data.apply(lambda x: x.str.lower()
                                if x.dtype == 'object' else x)
    dist_data = dist_data.apply(lambda x: x.str.replace('-', '')
                                if x.dtype == 'object' else x)
    dist_data = dist_data.apply(lambda x: x.str.replace(' ', '')
                                if x.dtype == 'object' else x)
    dist_series = dist_data.loc['education']
    comm.Barrier()
    t0 = time.time()
    dist_series.value_counts()
    comm.Barrier()
    if rank == 0:
        with open(file_stats, 'a') as file:
            file.write('{}, value_counts, {}\n'.format(
                number_processors, (time.time() - t0) * 1000))

    #---------------------------------------------------------------------------------------
    #apply function------------------------------------------------------------------
    #---------------------------------------------------------------------------------------
    pd_data = pd.read_csv(data_folder1 + "/adult_80K.data")
    #pd_data = pd.read_csv(data_folder2+"/adult_163K.data")
    pd_data.columns = [
        'age', 'workclass', 'fnlwgt', 'education', 'educationNum',
        'maritalStatus', 'occupation', 'relationship', 'race', 'sex',
        'capitalGain', 'capitalLoss', 'hoursPerWeek', 'nativeCountry',
        'payPerYear'
    ]
    dist_data = ParallelDataFrame(pd_data, dist_data=False)

    comm.Barrier()
    t0 = time.time()
    new_dist_data = dist_data.apply(lambda x: x.str.lower()
                                    if x.dtype == 'object' else x)
    comm.Barrier()
    if rank == 0:
        with open(file_stats, 'a') as file:
            file.write('{}, apply, {}\n'.format(number_processors,
                                                (time.time() - t0) * 1000))

    #---------------------------------------------------------------------------------------
    #from_dict function------------------------------------------------------------------
    #---------------------------------------------------------------------------------------
    pd_data = pd.read_csv(data_folder1 + "/adult_80K.data")
    #pd_data = pd.read_csv(data_folder2+"/adult_163K.data")
    pd_data.columns = [
        'age', 'workclass', 'fnlwgt', 'education', 'educationNum',
        'maritalStatus', 'occupation', 'relationship', 'race', 'sex',
        'capitalGain', 'capitalLoss', 'hoursPerWeek', 'nativeCountry',
        'payPerYear'
    ]
    data_dict = pd_data.to_dict()

    comm.Barrier()
    t0 = time.time()
    dist_data = ParallelDataFrame.from_dict(data_dict, orient='index')
    comm.Barrier()
    if rank == 0:
        with open(file_stats, 'a') as file:
            file.write('{}, from_dict, {}\n'.format(number_processors,
                                                    (time.time() - t0) * 1000))

    #---------------------------------------------------------------------------------------
    #corr function------------------------------------------------------------------
    #---------------------------------------------------------------------------------------
    pd_data = pd.read_csv(data_folder1 + "/adult_discretized_82K.data")
    #pd_data = pd.read_csv(data_folder2+"/adult_discretized_164K.data")
    dist_data = ParallelDataFrame(pd_data)

    comm.Barrier()
    t0 = time.time()
    dist_data.corr()
    comm.Barrier()
    if rank == 0:
        with open(file_stats, 'a') as file:
            file.write('{}, corr, {}\n'.format(number_processors,
                                               (time.time() - t0) * 1000))
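
The excerpt ends with the corr benchmark; the original script presumably has a module-level entry point. A minimal sketch, assuming the file is saved as microbenchmark.py (hypothetical name) and launched with mpiexec:

if __name__ == '__main__':
    main()

# Typical MPI launch (file name and process count are illustrative):
#   mpiexec -n 4 python microbenchmark.py --stats_file ./microbenchmark_stats.txt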