示例#1
0
 def test_inplace_dropping_single_row_in_column_distributed_dataframe(self):
     """Drop row label 2 in place from a frame built with ``dist_data=False``.

     The global column labels must be unchanged and the global index must
     shrink to ``[0, 1]``.
     """
     frame = ParallelDataFrame(self.dict2, dist_data=False)
     self.assertEqual(frame.globalShape, (3, 4))

     frame.drop(2, axis=0, inplace=True)

     expected_columns = {'key1', 'key2', 'key3', 'key4'}
     self.assertEqual(set(frame.globalColumns), expected_columns)
     self.assertEqual(list(frame.globalIndex), [0, 1])
示例#2
0
    def test_div_constant_distributed_df(self):
        """Dividing a ``dist_data=False`` frame by 10 matches pandas ``div``.

        The expected value is the pandas quotient of ``self.pd_df1`` wrapped
        in a ParallelDataFrame created with the same ``dist_data=False``.
        """
        dividend = ParallelDataFrame(self.pd_df1, dist_data=False)
        expected = ParallelDataFrame(self.pd_df1.div(10), dist_data=False)

        quotient = dividend.div(10)

        self.assertTrue(quotient.equals(expected))
示例#3
0
 def test_inplace_dropping_multiple_columns_in_column_distributed_dataframe(
         self):
     """Drop 'key4' and 'key8' in place from a ``dist_data=False`` frame.

     The six remaining column labels must survive and the global index
     must stay ``[0, 1, 2]``.
     """
     frame = ParallelDataFrame(self.dict3, dist_data=False)
     self.assertEqual(frame.globalShape, (3, 8))

     frame.drop(['key4', 'key8'], axis=1, inplace=True)

     remaining = {'key1', 'key2', 'key3', 'key5', 'key6', 'key7'}
     self.assertEqual(set(frame.globalColumns), remaining)
     self.assertEqual(list(frame.globalIndex), [0, 1, 2])
示例#4
0
    def test_div_by_multiIndex_by_level_replicated_df(self):
        """Replicated ``div`` with ``level=1, fill_value=0`` matches pandas.

        Both operands are wrapped as replicated ParallelDataFrames and the
        quotient is compared with the plain pandas result for the same
        arguments.
        """
        div_options = {'level': 1, 'fill_value': 0}
        numerator = ParallelDataFrame(self.pd_df2, dist='replicated')
        denominator = ParallelDataFrame(self.df_multindex, dist='replicated')

        actual = numerator.div(denominator, **div_options)
        expected = self.pd_df2.div(self.df_multindex, **div_options)

        self.assertTrue(actual.equals(expected))
示例#5
0
    def test_replicated_df_apply_function_sqrt_returns_replicated_df(self):
        """``apply(np.sqrt)`` on a replicated frame matches pandas.

        The result must equal the pandas result and report
        ``dist == 'replicated'``.
        """
        replicated = ParallelDataFrame(self.pd_df1, dist='replicated')

        expected = self.pd_df1.apply(np.sqrt)
        actual = replicated.apply(np.sqrt)

        self.assertTrue(actual.equals(expected))
        self.assertEqual(actual.dist, 'replicated')
示例#6
0
 def test_non_inplace_dropping_multiple_columns_and_row_in_same_call_replicated_dataframe(
         self):
     """Drop two columns and one row in a single non-inplace ``drop`` call.

     The returned replicated frame must lack 'key4', 'key7' and row 1.
     """
     source = ParallelDataFrame(self.dict3, dist='replicated')

     trimmed = source.drop(columns=['key4', 'key7'], index=1, inplace=False)

     remaining = {'key1', 'key2', 'key3', 'key5', 'key6', 'key8'}
     self.assertEqual(set(trimmed.globalColumns), remaining)
     self.assertEqual(list(trimmed.globalIndex), [0, 2])
示例#7
0
 def test_non_inplace_dropping_single_column_in_column_distributed_dataframe(
         self):
     """Drop 'key4' with ``inplace=False`` from a ``dist_data=False`` frame.

     The returned frame must keep the other three columns and the full
     global index ``[0, 1, 2]``.
     """
     source = ParallelDataFrame(self.dict2, dist_data=False)
     self.assertEqual(source.globalShape, (3, 4))

     trimmed = source.drop('key4', axis=1, inplace=False)

     self.assertEqual(set(trimmed.globalColumns), {'key1', 'key2', 'key3'})
     self.assertEqual(list(trimmed.globalIndex), [0, 1, 2])
示例#8
0
    def test_corr_with_replicated_dataframe(self):
        """``corr()`` on a replicated frame equals pandas ``corr()``."""
        rows = [(.2, .3, .4), (.0, .6, .9), (.6, .0, .6), (.2, .1, .1)]
        reference = pd.DataFrame(rows, columns=['dogs', 'cats', 'rats'])
        replicated = ParallelDataFrame(reference, dist='replicated')

        actual = replicated.corr()
        expected = reference.corr()

        self.assertTrue(actual.equals(expected))
示例#9
0
    def test_replicated_df_apply_function_list_like_result_returns_replicated_series(
            self):
        """Row-wise ``apply`` returning a list yields a replicated series.

        The function returns ``[1, 2]`` for every row (``axis=1``). The
        result must be a ParallelSeries, report ``dist == 'replicated'`` and
        equal the pandas result for the same call.
        """
        df = ParallelDataFrame(self.pd_df1, dist='replicated')

        pd_result = self.pd_df1.apply(lambda x: [1, 2], axis=1)
        result = df.apply(lambda x: [1, 2], axis=1)

        # assertIsInstance reports the actual type on failure, unlike
        # assertTrue(isinstance(...)) which only says "False is not true".
        self.assertIsInstance(result, ParallelSeries)
        self.assertEqual(result.dist, 'replicated')
        self.assertTrue(result.equals(pd_result))
示例#10
0
    def test_replicated_df_apply_function_sum_axis1_returns_replicated_series(
            self):
        """``apply(np.sum, axis=1)`` on a replicated frame matches pandas.

        The result must be a ParallelSeries, report ``dist == 'replicated'``
        and equal the pandas row sums.
        """
        df1 = ParallelDataFrame(self.pd_df1, dist='replicated')

        pd_result = self.pd_df1.apply(np.sum, axis=1)
        result = df1.apply(np.sum, axis=1)

        # assertIsInstance reports the actual type on failure, unlike
        # assertTrue(isinstance(...)) which only says "False is not true".
        self.assertIsInstance(result, ParallelSeries)
        self.assertEqual(result.dist, 'replicated')
        self.assertTrue(result.equals(pd_result))
示例#11
0
    def test_column_distributed_df_apply_function_sqrt_returns_distributed_df(
            self):
        """``apply(np.sqrt)`` on a ``dist_data=False`` frame stays distributed.

        The square-rooted frame must be a ParallelDataFrame with
        ``dist == 'distributed'`` and must differ from the input. Squaring
        it again must reproduce the input frame.
        """
        df1 = ParallelDataFrame(self.pd_df1, dist_data=False)
        result = df1.apply(np.sqrt)
        # Round trip: square(sqrt(x)) is compared against the original.
        df3 = result.apply(np.square)

        # assertIsInstance reports the actual type on failure, unlike
        # assertTrue(isinstance(...)) which only says "False is not true".
        self.assertIsInstance(result, ParallelDataFrame)
        self.assertEqual(result.dist, 'distributed')
        self.assertFalse(result.equals(df1))
        self.assertTrue(df1.equals(df3))
示例#12
0
    def test_slicing_with_slice_object_getting_dist_df_in_column_distributed_df(
            self):
        """``.loc[0:1]`` on a ``dist_data=False`` frame yields a distributed frame.

        The slice must be a ParallelDataFrame with ``dist == 'distributed'``
        and equal the pandas slice wrapped with the same ``dist_data=False``.
        """
        d = {'col1': [1, 2], 'col2': [3, 4], 'col3': [5, 6]}
        pd_df = pd.DataFrame(data=d)
        dist_df = ParallelDataFrame(data=d, dist_data=False)

        dist_slice = dist_df.loc[0:1]
        pd_slice = pd_df.loc[0:1]
        pd_slice_dist = ParallelDataFrame(data=pd_slice, dist_data=False)

        # assertIsInstance reports the actual type on failure, unlike
        # assertTrue(isinstance(...)) which only says "False is not true".
        self.assertIsInstance(dist_slice, ParallelDataFrame)
        self.assertEqual(dist_slice.dist, 'distributed')
        self.assertTrue(dist_slice.equals(pd_slice_dist))
示例#13
0
    def test_column_distributed_df_apply_function_sum_returns_distributed_series_raw_False(
            self):
        """``apply(np.sum, axis=0, raw=False)`` yields a distributed series.

        The result must be a ParallelSeries with ``dist == 'distributed'``.
        Its global index must hold the same labels as the pandas result, and
        the collected values must match pandas after both are sorted by
        index.
        """
        df1 = ParallelDataFrame(self.pd_df1, dist_data=False)

        pd_result = self.pd_df1.apply(np.sum, axis=0, raw=False)
        result = df1.apply(np.sum, axis=0, raw=False)

        # assertIsInstance reports the actual type on failure, unlike
        # assertTrue(isinstance(...)) which only says "False is not true".
        self.assertIsInstance(result, ParallelSeries)
        self.assertEqual(result.dist, 'distributed')
        # Labels are compared as sets, so their order is not checked here.
        self.assertEqual(set(result.globalIndex), set(pd_result.index))
        self.assertTrue(result.collect().sort_index().equals(
            pd_result.sort_index()))
示例#14
0
    def test_corr_with_col_distributed_dataframe(self):
        """``corr()`` on a ``dist_data=False`` frame matches pandas row by row.

        Each row of the distributed correlation matrix is collected, sorted
        by index and rounded to 6 digits before being compared with the
        corresponding pandas row.
        """
        rows = [(.2, .3, .4), (.0, .6, .9), (.6, .0, .6), (.2, .1, .1)]
        reference = pd.DataFrame(rows, columns=['dogs', 'cats', 'rats'])
        distributed = ParallelDataFrame(reference, dist_data=False)

        actual_corr = distributed.corr()
        expected_corr = reference.corr()

        # Compare the values of each row (rounded to 6 digits).
        for label in actual_corr.globalIndex:
            actual_row = actual_corr.loc[label].collect().sort_index()
            expected_row = expected_corr.loc[label].sort_index()
            self.assertEqual(list(actual_row.round(6)),
                             list(expected_row.round(6)))
示例#15
0
 def test_distributed_df_creation_with_from_dict_function_orient_index(
         self):
     """``from_dict(..., orient='index')`` turns the dict keys into rows.

     The frame must have global shape (4, 3), the four keys as its global
     index and ``[0, 1, 2]`` as its global columns.
     """
     frame = ParallelDataFrame.from_dict(self.dict2, orient='index')

     self.assertEqual(frame.globalShape, (4, 3))
     expected_index = {'key1', 'key2', 'key3', 'key4'}
     self.assertEqual(set(frame.globalIndex), expected_index)
     self.assertEqual(list(frame.globalColumns), [0, 1, 2])
示例#16
0
 def test_inplace_dropping_single_row_in_index_distributed_dataframe(self):
     """Drop row 'key4' in place from a frame built with ``orient='index'``.

     The three remaining row labels must survive and the global columns
     must stay ``[0, 1, 2]``.
     """
     frame = ParallelDataFrame.from_dict(self.dict2, orient='index')
     self.assertEqual(frame.globalShape, (4, 3))

     frame.drop('key4', axis=0, inplace=True)

     self.assertEqual(set(frame.globalIndex), {'key1', 'key2', 'key3'})
     self.assertEqual(list(frame.globalColumns), [0, 1, 2])
示例#17
0
    def test_replicated_df_creation_with_from_dict_function_orient_index(self):
        """Replicated ``from_dict(orient='index')`` equals pandas ``from_dict``."""
        expected = pd.DataFrame.from_dict(self.dict2, orient='index')

        replicated = ParallelDataFrame.from_dict(
            self.dict2, orient='index', dist='replicated')

        self.assertTrue(replicated.equals(expected))
示例#18
0
    def test_replicated_df_creation_with_constructor_input_dictionary(self):
        """Build a replicated frame from a dict via the constructor.

        The frame must have the same shape as the pandas frame built from
        the same dict, be a ParallelDataFrame and report
        ``dist == 'replicated'``.
        """
        df = pd.DataFrame(self.dict1)
        rep_df = ParallelDataFrame(self.dict1, dist='replicated')

        self.assertEqual(df.shape, rep_df.shape)
        # assertIsInstance reports the actual type on failure, unlike
        # assertTrue(isinstance(...)) which only says "False is not true".
        self.assertIsInstance(rep_df, ParallelDataFrame)
        self.assertEqual(rep_df.dist, 'replicated')
示例#19
0
    def test_distributed_df_creation_with_constructor_input_dataframe(self):
        """Build a ``dist_data=False`` frame from a pandas DataFrame.

        The frame must be a ParallelDataFrame with ``dist == 'distributed'``
        and a global shape equal to the pandas shape. Its local shape must
        differ from its global shape.
        """
        df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6]]))
        dist_df = ParallelDataFrame(df, dist_data=False)

        # assertIsInstance reports the actual type on failure, unlike
        # assertTrue(isinstance(...)) which only says "False is not true".
        self.assertIsInstance(dist_df, ParallelDataFrame)
        self.assertEqual(dist_df.dist, 'distributed')
        self.assertEqual(dist_df.globalShape, df.shape)
        # NOTE(review): presumably only holds when the data is actually
        # split across more than one process -- confirm how tests are run.
        self.assertNotEqual(dist_df.shape, dist_df.globalShape)
示例#20
0
 def test_globalIndex_replicated_series(self):
     """Row 1 of a replicated frame exposes the pandas labels as globalIndex."""
     expected_row = pd.DataFrame(self.dict3).loc[1]

     replicated = ParallelDataFrame(data=self.dict3, dist='replicated')
     actual_row = replicated.loc[1]

     # Labels are compared as sets, so their order is not checked.
     self.assertEqual(set(actual_row.globalIndex), set(expected_row.index))
示例#21
0
 def test_global_to_local_distributed_series(self):
     """``global_to_local`` keys of a distributed row match the pandas labels.

     Row 1 is taken from a ``dist_data=False`` frame and the keys of its
     ``global_to_local`` mapping are compared, as a set, with the index of
     the same row in pandas.
     """
     expected_row = pd.DataFrame(self.dict3).loc[1]

     distributed = ParallelDataFrame(data=self.dict3, dist_data=False)
     actual_row = distributed.loc[1]

     mapped_labels = set(actual_row.global_to_local.keys())
     self.assertEqual(mapped_labels, set(expected_row.index))
示例#22
0
    def test_slicing_getting_cell_value_in_replicated_df(self):
        """``.loc[1, 'col2']`` on a replicated frame returns the pandas cell."""
        data = {'col1': [1, 2], 'col2': [3, 4], 'col3': [5, 6]}
        reference = pd.DataFrame(data=data)
        replicated = ParallelDataFrame(data=data, dist='replicated')

        actual_cell = replicated.loc[1, 'col2']
        expected_cell = reference.loc[1, 'col2']

        self.assertEqual(actual_cell, expected_cell)
示例#23
0
    def test_slicing_with_boolean_array_getting_rep_df_from_replicated_df(
            self):
        """Boolean-mask ``.loc`` on a replicated frame matches pandas.

        Both selections are sorted by index before being compared.
        """
        data = {'col1': [1, 2, 4, 5], 'col2': [3, 4, 6, 7],
                'col3': [5, 6, 1, 3]}
        reference = pd.DataFrame(data=data)
        replicated = ParallelDataFrame(data=data, dist='replicated')

        actual = replicated.loc[[True, False, False, True]]
        expected = reference.loc[[True, False, False, True]]

        self.assertTrue(actual.sort_index().equals(expected.sort_index()))
示例#24
0
 def test_creating_distributed_series_by_gettting_row_from_column_distributed_dataframe(self):
     """``.loc[1]`` on a ``dist_data=False`` frame yields a distributed series.

     The row must be a ParallelSeries with ``dist == "distributed"``. Once
     collected, it must equal the same pandas row after both are sorted by
     index.
     """
     dist_df = ParallelDataFrame(data=self.d, dist_data=False)
     parallel_series = dist_df.loc[1]

     pd_df = pd.DataFrame(self.d)
     pd_series = pd_df.loc[1]

     # assertIsInstance reports the actual type on failure, unlike
     # assertTrue(isinstance(...)) which only says "False is not true".
     self.assertIsInstance(parallel_series, ParallelSeries)
     self.assertEqual(parallel_series.dist, "distributed")

     self.assertTrue(
         parallel_series.collect().sort_index().equals(
             pd_series.sort_index()))
示例#25
0
 def test_value_count_with_distributed_series_and_string_data(self):
     """``value_counts()`` on a distributed row matches pandas.

     Row 0 is taken from a ``dist_data=False`` frame built from
     ``self.np_array2``. The counts must report ``dist == 'replicated'``
     and equal the pandas counts after both are sorted by index.
     """
     pd_df = pd.DataFrame(self.np_array2)
     pd_series = pd_df.loc[0]

     dist_df = ParallelDataFrame(pd_df, dist_data=False)
     dist_series = dist_df.loc[0]
     pd_series_vc = pd_series.value_counts()
     dist_series_vc = dist_series.value_counts()

     # assertEqual, not assertTrue: assertTrue(x, 'replicated') treats the
     # string as the failure message and passes for any truthy ``dist``.
     self.assertEqual(dist_series_vc.dist, 'replicated')
     self.assertTrue(
         dist_series_vc.sort_index().equals(pd_series_vc.sort_index()))
示例#26
0
    def test_slicing_with_list_of_labels_getting_rep_df_from_replicated_df(
            self):
        """``.loc[[0, 3]]`` on a replicated frame yields a replicated frame.

        The selection must be a ParallelDataFrame with
        ``dist == 'replicated'`` and equal the pandas selection after both
        are sorted by index.
        """
        d = {'col1': [1, 2, 4, 5], 'col2': [3, 4, 6, 7], 'col3': [5, 6, 1, 3]}
        pd_df = pd.DataFrame(data=d)
        rep_df = ParallelDataFrame(data=d, dist='replicated')

        rep_slice = rep_df.loc[[0, 3]]
        pd_slice = pd_df.loc[[0, 3]]

        # assertIsInstance reports the actual type on failure, unlike
        # assertTrue(isinstance(...)) which only says "False is not true".
        self.assertIsInstance(rep_slice, ParallelDataFrame)
        self.assertEqual(rep_slice.dist, 'replicated')
        self.assertTrue(rep_slice.sort_index().equals(pd_slice.sort_index()))
示例#27
0
    def test_slicing_with_single_label_getting_rep_series_from_replicated_df(
            self):
        """``.loc[1]`` on a replicated frame yields a replicated series.

        The row must be a ParallelSeries with ``dist == 'replicated'`` and
        equal the pandas row after both are sorted by index.
        """
        d = {'col1': [1, 2], 'col2': [3, 4], 'col3': [5, 6]}
        pd_df = pd.DataFrame(data=d)
        rep_df = ParallelDataFrame(data=d, dist='replicated')

        rep_series = rep_df.loc[1]
        pd_series = pd_df.loc[1]

        # assertIsInstance reports the actual type on failure, unlike
        # assertTrue(isinstance(...)) which only says "False is not true".
        self.assertIsInstance(rep_series, ParallelSeries)
        self.assertEqual(rep_series.dist, 'replicated')
        self.assertTrue(rep_series.sort_index().equals(pd_series.sort_index()))
示例#28
0
    def test_slicing_with_single_label_getting_dist_series_from_column_distributed_df(
            self):
        """``.loc[1]`` on a ``dist_data=False`` frame yields a distributed series.

        The row must be a ParallelSeries with ``dist == 'distributed'``.
        Once collected, it must equal the pandas row after both are sorted
        by index.
        """
        d = {'col1': [1, 2], 'col2': [3, 4], 'col3': [5, 6]}
        pd_df = pd.DataFrame(data=d)
        dist_df = ParallelDataFrame(data=d, dist_data=False)

        dist_series = dist_df.loc[1]
        pd_series = pd_df.loc[1]

        # assertIsInstance reports the actual type on failure, unlike
        # assertTrue(isinstance(...)) which only says "False is not true".
        self.assertIsInstance(dist_series, ParallelSeries)
        self.assertEqual(dist_series.dist, 'distributed')
        self.assertTrue(dist_series.collect().sort_index().equals(
            pd_series.sort_index()))
示例#29
0
 def test_value_count_with_distributed_series_and_float_data(self):
     """``value_counts()`` on a distributed row of ``self.dict3`` matches pandas.

     Row 1 is taken from a ``dist_data=False`` frame. The counts must
     report ``dist == 'replicated'`` and equal the pandas counts, whose
     index is converted to strings first, after both are sorted by index.
     """
     pd_df = pd.DataFrame(self.dict3)
     pd_series = pd_df.loc[1]

     dist_df = ParallelDataFrame(data=self.dict3, dist_data=False)
     dist_series = dist_df.loc[1]

     pd_series_vc = pd_series.value_counts()
     dist_series_vc = dist_series.value_counts()
     # Convert the indices to string (that is what the parallelPandas
     # returns).
     pd_series_vc.index = pd_series_vc.index.map(str)

     # assertEqual, not assertTrue: assertTrue(x, 'replicated') treats the
     # string as the failure message and passes for any truthy ``dist``.
     self.assertEqual(dist_series_vc.dist, 'replicated')
     self.assertTrue(
         dist_series_vc.sort_index().equals(pd_series_vc.sort_index()))
示例#30
0
 def test_global_to_local_replicated_series(self):
     """``find_global_to_local()`` on a replicated row raises ValueError."""
     replicated = ParallelDataFrame(data=self.dict3, dist='replicated')
     row = replicated.loc[1]

     # Look the method up outside the context manager so that only the
     # call itself is expected to raise.
     finder = row.find_global_to_local
     with self.assertRaises(ValueError):
         finder()