def create_inverted_index(dataFrame, total_words, top_words):
    """Build a word-by-file weight table and align it with ``top_words``.

    For every column (file) of ``dataFrame`` the values are counted with
    ``value_counts`` and, when ``total_words[onefile]`` is non-zero, divided
    by it. The per-file results are concatenated column-wise, the ``'##'``
    placeholder row is removed, missing entries are filled with
    ``np.float32(0)`` and the result is concatenated column-wise behind
    ``top_words``.

    Args:
        dataFrame: frame whose columns are processed one by one.
        total_words: object indexable by the column labels of ``dataFrame``;
            supplies the divisor for each column.
        top_words: pandas object placed first in the final concatenation so
            that its row labels are present in the result.

    Returns:
        The concatenated frame with NaN replaced by ``np.float32(0)``.
    """
    # ``sort`` is only passed to ``pd.concat`` from pandas 0.23 on; the
    # version is looked up once and reused for both concatenations.
    concat_options = {'sort': False} if get_pandas_version() >= 0.23 else {}

    inverted_index = dpd.ParallelDataFrame()
    for onefile in dataFrame.columns:
        if dataFrame[onefile].isnull().all():
            # all-null column: a single NaN row labelled '##' keeps a column
            # for this file; the row is dropped again after the loop
            word_weights_per_file = dpd.ParallelDataFrame(
                np.nan, index=['##'], columns=[onefile])
        else:
            word_weights_per_file = dpd.ParallelDataFrame(
                dataFrame[onefile].value_counts())
            word_weights_per_file.index.name = "words"
            if total_words[onefile] != 0:
                word_weights_per_file = word_weights_per_file.div(
                    total_words[onefile])
        inverted_index = pd.concat([inverted_index, word_weights_per_file],
                                   axis=1, **concat_options)
        del word_weights_per_file

    inverted_index.columns = dataFrame.columns
    if '##' in inverted_index.index:
        inverted_index.drop('##', axis=0, inplace=True)
    inverted_index.fillna(np.float32(0), inplace=True)

    # makes sure to include all top words in inverted index
    inverted_index = pd.concat([top_words, inverted_index], axis=1,
                               **concat_options)
    inverted_index.fillna(np.float32(0), inplace=True)
    return inverted_index
def read_files(path_input):
    """Read all documents under ``path_input`` into a one-column frame.

    ``parallelIO.read_all`` is asked for a dict, which is turned into a
    frame with one row per key, a single ``'text'`` column and an index
    named ``"filename"``.

    Args:
        path_input: location handed unchanged to ``parallelIO.read_all``.

    Returns:
        The resulting ``pd.DataFrame``. When it is empty an error message is
        printed and ``exit_program()`` is called before returning.
    """
    file_name_and_text = parallelIO.read_all(path_input, return_type='dict')

    # ``from_dict(..., columns=...)`` is only used from pandas 0.23 on;
    # older versions get the default column 0 renamed instead
    if get_pandas_version() >= 0.23:
        dataFrame = pd.DataFrame.from_dict(
            file_name_and_text, orient='index', columns=['text'])
    else:
        dataFrame = pd.DataFrame.from_dict(
            file_name_and_text, orient='index').rename(columns={0: 'text'})
    dataFrame.index.name = "filename"

    # ``from_dict`` never returns None, so emptiness is the only failure mode
    if dataFrame.empty:
        print(
            "ERROR: Either the directory does not exist!\nOR no documents in the directory!\nOR reduce the number of processors!"
        )
        exit_program()
    return dataFrame
def loc(self):
    """Return a ``_CustomLocIndexer`` bound to this object under the name 'loc'.

    The indexer's argument order differs by pandas version: from 0.25 on the
    name comes first, before that the object comes first and the name is
    passed by keyword.
    """
    from .ParallelPandasUtils import _CustomLocIndexer

    name_first = get_pandas_version() >= 0.25
    if name_first:
        indexer = _CustomLocIndexer("loc", self)
    else:
        indexer = _CustomLocIndexer(self, name='loc')
    return indexer
def __recv_and_process_for_corr(self, source, number_processors, rank, df,
                                method, min_periods, output):
    """
    Helper routine for the correlation function: performs blocking receives
    from ``source``, correlates the received columns with the columns of
    ``df`` and appends the result to ``output``.

    Three messages are received from ``source``: the shape of the data
    (tag ``source + rank + 100``), the column labels (tag
    ``source + rank + 10``) and the float32 data itself (tag
    ``source + rank``).

    Args:
        source: rank to receive from; nothing is received when it lies
            outside ``[0, number_processors)``.
        number_processors: number of ranks taking part.
        rank: rank of the calling process, used to build the message tags.
        df: local dataframe whose columns are correlated with the received
            ones.
        method, min_periods: forwarded to ``DataFrame.corr``.
        output: correlation frame accumulated so far.

    Returns: a dataframe (``output`` itself when ``source`` is out of range)
    """
    # no such rank: leave the accumulated result untouched
    if not (source >= 0 and source < number_processors):
        return output

    pandas_version = get_pandas_version()
    tag = source + rank

    # ``np.int`` no longer exists in current NumPy; 'l' is the C ``long``
    # dtype, which is what the MPI.LONG receive below writes into the buffer.
    recv_shape = np.zeros(2, dtype=np.dtype('l'))
    self.comm.Recv([recv_shape, 2, MPI.LONG], source=source, tag=tag + 100)

    # labels of the received columns (file names, per the original comment)
    data_labels = self.comm.recv(source=source, tag=tag + 10)

    recv_data = np.zeros([recv_shape[0], recv_shape[1]], dtype=np.float32)
    self.comm.Recv(
        [recv_data, recv_shape[0] * recv_shape[1], MPI.FLOAT],
        source=source,
        tag=tag)

    # temporary frame: received columns first, then the local columns
    temp_df = pd.DataFrame()
    for i, a_label in enumerate(data_labels):
        temp_df[a_label] = recv_data[:, i]
    del recv_data
    for column in df.columns:
        temp_df[column] = df[column].values

    # -----can be done without the 2 lines below
    temp_df['index'] = df.index.values
    temp_df.set_index('index', inplace=True)

    temp_output = temp_df.corr(method=method, min_periods=min_periods)

    # NOTE(review): ``DataFrame.append`` is gone in pandas 2.x; left as is
    # because ``output`` may be a project type with its own ``append``.
    if pandas_version >= 0.23:
        output = output.append(temp_output, sort=False)
        # merge duplicate rows if any
        output = output.groupby(output.index, sort=False,
                                axis=0).min(axis=1, skipna=True)
    else:
        output = output.append(temp_output)
        # merge duplicate rows if any
        output = output.groupby(output.index, sort=False, axis=0).min()
    del temp_output

    # drop the rows of the received labels: that data exists on other
    # processors (comparison of docs within a processor)
    for a_label in data_labels:
        if a_label in output.index:
            output.drop(a_label, axis=0, inplace=True)
    return output
def from_dict(cls, data, orient="columns", columns=None, dtype=None,
              comm=MPI.COMM_WORLD, dist='distributed') -> "ParallelDataFrame":
    """
    Alternate constructor: build a dataframe from a dictionary using the
    requested orientation ('index' or 'columns' are both supported).

    With ``dist == 'distributed'`` the dictionary is first passed through
    ``_get_distributed_data`` and the instance is created with
    ``dist_data=True``; any other ``dist`` value builds a plain pandas frame
    from the whole dictionary and wraps it as 'replicated'.
    """
    if dist != 'distributed':
        # ``columns`` is only forwarded to pandas from version 0.23 on
        if get_pandas_version() >= 0.23:
            whole_frame = pd.DataFrame.from_dict(
                data, orient=orient, columns=columns, dtype=dtype)
        else:
            whole_frame = pd.DataFrame.from_dict(
                data, orient=orient, dtype=dtype)
        return cls(data=whole_frame, comm=comm, dtype=dtype,
                   dist='replicated')

    local_part = ParallelDataFrame._get_distributed_data(
        data, orient, columns, dtype, comm)
    return cls(data=local_part, comm=comm, dtype=dtype, dist_data=True,
               orient=orient)
def apply(self, func, axis=0, raw=False, result_type=None, args=(), **kwds):
    """
    Apply ``func`` along an axis of the ParallelDataFrame.

    The work is delegated to the pandas implementation and the result is
    wrapped again so that it keeps this object's ``dist`` and ``comm``.

    Args:
        func: function to apply.
        axis, raw, args: forwarded to pandas unchanged.
        result_type: forwarded to pandas only from pandas 0.23 on.
        **kwds: extra keyword arguments forwarded to pandas, which hands
            them to ``func`` (previously these were silently dropped).

    Return: a parallel series when pandas returns a Series, a parallel
    dataframe when it returns a DataFrame.
    """
    if get_pandas_version() >= 0.23:
        super_return = super().apply(func,
                                     axis=axis,
                                     raw=raw,
                                     result_type=result_type,
                                     args=args,
                                     **kwds)
    else:
        super_return = super().apply(func, axis=axis, raw=raw, args=args,
                                     **kwds)

    if isinstance(super_return, pd.Series):
        return self.__constructor_sliced(data=super_return,
                                         dist=self.dist,
                                         comm=self.comm)
    elif isinstance(super_return, pd.DataFrame):
        return self.__constructor(data=super_return,
                                  dist=self.dist,
                                  comm=self.comm,
                                  orient=self.orient)
def __sort_series_by_value(self, ascending, inplace):
    """Sort by value with whichever method the running pandas version uses.

    ``sort_values`` is called from pandas 0.17 on, ``sort`` otherwise
    (pandas 0.16); ``ascending`` and ``inplace`` are forwarded unchanged.
    """
    has_sort_values = get_pandas_version() >= 0.17
    sorter = self.sort_values if has_sort_values else self.sort
    return sorter(axis=0, ascending=ascending, inplace=inplace)
def print_pairs(similarity_matrix, file_path, pair_number):
    """Write one line per non-NaN cell of ``similarity_matrix`` to a file.

    Each line has the form ``"<pair_number> <column label> <row label>"``
    followed by tabs and the cell value, and is written with
    ``print_to_file``.

    Args:
        similarity_matrix: frame that is walked column by column.
        file_path: destination handed to ``print_to_file``.
        pair_number: number used for the first line written; it is
            incremented after every line.

    Returns:
        The next unused pair number.
    """
    for col_label, column in similarity_matrix.items():
        # ``dropna()`` returns a new Series, so the caller's frame is never
        # touched (the original dropped in place on the yielded column)
        for row_label, content in column.dropna().items():
            to_print = "{} {} {}\t\t\t\t {} \n".format(pair_number, col_label,
                                                       row_label, content)
            print_to_file(to_print, file_path)
            pair_number += 1
    return pair_number
def _get_distributed_data(data, orient, columns, dtype, comm):
    """
    Helper routine to distribute dictionary data uniformly.

    ``__dictionary_info`` supplies the key list and the positions selected
    for this call; only those entries of ``data`` are turned into a frame.
    """
    _, keys, indices = ParallelDataFrame.__dictionary_info(data, comm)
    selected_items = {keys[position]: data[keys[position]]
                      for position in indices}

    # ``columns`` is only forwarded to pandas from version 0.23 on
    frame_options = {'orient': orient, 'dtype': dtype}
    if get_pandas_version() >= 0.23:
        frame_options['columns'] = columns
    return pd.DataFrame.from_dict(selected_items, **frame_options)
def read_files(path):
    """Read every entry of directory ``path`` into a one-row frame.

    Each file is opened in binary mode and its content stored as
    ``str(bytes)``. The returned frame has one column per file name and a
    single row labelled ``'text'``.

    Args:
        path: directory to read; a trailing path separator is optional.

    Returns:
        ``pd.DataFrame`` with file names as columns and index ``['text']``.
    """
    file_name_and_text = {}
    for filename in os.listdir(path):
        # join instead of ``path + filename`` so a missing trailing
        # separator no longer produces a wrong file name
        with open(os.path.join(path, filename), "rb") as myfile:
            # NOTE(review): str() of bytes yields the "b'...'" repr, not
            # decoded text; kept because callers may rely on it - confirm.
            file_name_and_text[filename] = [str(myfile.read())]

    # both pandas-version branches of the original made this same call
    dataFrame = pd.DataFrame.from_dict(file_name_and_text, orient='columns')
    dataFrame.index = ['text']
    return dataFrame
def create_inverted_index(dataFrame, total_words, top_words):
    """Create the inverted index and align it with ``top_words``.

    The index itself (row labels = words, column labels = filenames) comes
    from ``serial.create_inverted_index``; it is then concatenated
    column-wise behind ``top_words`` and NaNs are replaced by
    ``np.float32(0)``.
    """
    local_index = serial.create_inverted_index(dataFrame, total_words,
                                               top_words)

    # makes sure to include all top words in inverted index;
    # ``sort`` is only passed to ``pd.concat`` from pandas 0.23 on
    pieces = [top_words, local_index]
    if get_pandas_version() >= 0.23:
        combined = pd.concat(pieces, axis=1, sort=False)
    else:
        combined = pd.concat(pieces, axis=1)

    combined.fillna(np.float32(0), inplace=True)
    return combined
def get_similar_documents(similarity_matrix):
    """Return the per-column maximum of ``similarity_matrix``, best first.

    The result is a one-column frame (``"score"``) sorted in descending
    order, with the index named ``"filename"`` and NaN scores replaced by
    -10.
    """
    column_maxima = similarity_matrix.max(skipna=True)
    scores = pd.DataFrame(column_maxima).rename(columns={0: "score"})

    # sort_values for 0.17 and higher pandas, sort_index otherwise
    if get_pandas_version() >= 0.17:
        max_sorted = scores.sort_values(by="score",
                                        ascending=False,
                                        inplace=False)
    else:
        max_sorted = scores.sort_index(by="score",
                                       ascending=False,
                                       inplace=False)

    max_sorted.index.name = "filename"
    max_sorted.fillna(inplace=True, value=-10)
    return max_sorted
def sort_df(df, col_name):
    """Return ``df`` sorted by ``col_name`` in descending order.

    ``sort_values`` is used from pandas 0.17 on, ``sort`` otherwise.
    """
    modern_pandas = get_pandas_version() >= 0.17
    if not modern_pandas:
        return df.sort(col_name, ascending=False)
    return df.sort_values(by=col_name, ascending=False)
def create_similarity_matrix(inverted_index):
    """Build the similarity matrix across all MPI ranks.

    The local matrix comes from ``serial.create_similarity_matrix``. Then,
    for ``step = 1 .. size - 1``, the local data is sent to ranks
    ``rank + step`` and ``rank - step`` and the data of ranks
    ``rank - step`` and ``rank + step`` is received and merged in by
    ``_recv_and_process_for_simi_matrix``. Finally the matrix is
    transposed, self comparisons are blanked out and all-NaN rows and
    columns are removed.

    Args:
        inverted_index: local frame; its values are sent as a contiguous
            float32 array together with its column labels and shape.

    Returns:
        The transposed similarity matrix without self comparisons.
    """
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    number_processors = comm.Get_size()

    similarity_matrix = serial.create_similarity_matrix(inverted_index)

    # sending contiguous array
    to_send = np.array(inverted_index.values, order='C', dtype=np.float32)
    to_send_shape = np.array(
        [inverted_index.shape[0], inverted_index.shape[1]])

    # constant for every step, so defined once outside the loop
    method = 'pearson'
    min_periods = 1

    req = []
    # each rank will 1st send to one up and below, then 2 up and below and
    # so on alternately...until the last one; creates a distributed corr
    # dataframe where data is divided by columns of dataframe
    for step in range(1, number_processors):
        # send to one below, then to one up
        for destination in (rank + step, rank - step):
            req = _send_for_simi_matrix(to_send, inverted_index.columns,
                                        to_send_shape, destination,
                                        number_processors, req,
                                        inverted_index.shape, rank, comm)

        # block receive and process from one above, then from one below
        for source in (rank - step, rank + step):
            similarity_matrix = _recv_and_process_for_simi_matrix(
                source, number_processors, rank, inverted_index, method,
                min_periods, similarity_matrix, comm)

        # wait for all the sends to complete
        if len(req) != 0:
            MPI.Request.Waitall(req)
            req = []

    del to_send
    del to_send_shape

    similarity_matrix = similarity_matrix.transpose()

    # removing self comparison for files. A single ``.loc[row, col]``
    # assignment always writes to the frame, whereas the chained
    # ``frame[i].loc[i] = ...`` may only modify a temporary copy. The
    # membership guard keeps ``.loc`` from adding a new row.
    for i in similarity_matrix.columns.values:
        if i in similarity_matrix.index:
            similarity_matrix.loc[i, i] = np.nan

    similarity_matrix.dropna(axis=1, how='all', inplace=True)
    similarity_matrix.dropna(axis=0, how='all', inplace=True)
    return similarity_matrix
class ParallelDataFrameTest(unittest.TestCase):
    """Unit tests for ParallelDataFrame: construction, ``from_dict``,
    ``global_to_local``, ``drop``, ``apply``, ``div``, ``.loc`` slicing and
    ``corr``, each for distributed and replicated frames.

    Tests that rely on keywords of newer pandas versions are only defined
    when ``get_pandas_version()`` reports a new enough version.
    """

    def setUp(self):
        # dictionaries with 3, 4 and 8 keys, three values each
        self.dict1 = {
            'key1': [10, 11, 22],
            'key2': [23, 34, 56],
            'key3': [1, 2, 3]
        }
        self.dict2 = {
            'key1': [10, 11, 22],
            'key2': [23, 34, 56],
            'key3': [1, 2, 3],
            'key4': [29, 38, 47]
        }
        self.dict3 = {
            'key1': [10, 11, 22],
            'key2': [23, 34, 56],
            'key3': [1, 2, 3],
            'key4': [29, 38, 47],
            'key5': [10, 11, 22],
            'key6': [23, 34, 56],
            'key7': [1, 2, 3],
            'key8': [29, 38, 47]
        }
        # 5x5 frame of perfect squares, used by the apply/div tests
        self.pd_df1 = pd.DataFrame([[4.0, 9.0, 16.0, 25.0, 36.0]] * 5,
                                   columns=['A', 'B', 'C', 'D', 'E'])
        # small labelled frame and a two-level-index frame for the div test
        self.pd_df2 = pd.DataFrame(
            {
                'angles': [0, 3, 4],
                'degrees': [360, 180, 360],
                'equalsides': [0, 3, 2]
            },
            index=['circle', 'triangle', 'rectangle'])
        self.df_multindex = pd.DataFrame(
            {
                'angles': [0, 3, 4, 4, 5, 6],
                'degrees': [360, 180, 360, 360, 540, 720],
                'equalsides': [0, 3, 2, 4, 5, 6]
            },
            index=[['A', 'A', 'A', 'B', 'B', 'B'],
                   [
                       'circle', 'triangle', 'rectangle', 'square',
                       'pentagon', 'hexagon'
                   ]])

    # sanity check that the test runner itself works
    def test_canary(self):
        self.assertTrue(True)

    #Testing constructor-------------------------------
    def test_creation_of_empty_parallel_dataframe(self):
        df = ParallelDataFrame()
        self.assertTrue(isinstance(df, ParallelDataFrame))
        self.assertTrue(df.empty)

    def test_replicated_df_creation_with_constructor_input_dictionary(self):
        df = pd.DataFrame(self.dict1)
        rep_df = ParallelDataFrame(self.dict1, dist='replicated')
        self.assertEqual(df.shape, rep_df.shape)
        self.assertTrue(isinstance(rep_df, ParallelDataFrame))
        self.assertEqual(rep_df.dist, 'replicated')

    def test_distributed_df_creation_with_constructor_input_dictionary(self):
        df = ParallelDataFrame(self.dict2, dist_data=False)
        self.assertEqual(df.globalShape, (3, 4))
        self.assertEqual(df.dist, "distributed")

    def test_distributed_df_creation_with_constructor_input_dataframe(self):
        df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6]]))
        dist_df = ParallelDataFrame(df, dist_data=False)
        self.assertTrue(isinstance(dist_df, ParallelDataFrame))
        self.assertEqual(dist_df.dist, 'distributed')
        self.assertEqual(dist_df.globalShape, df.shape)
        # NOTE(review): local and global shape can only differ when the data
        # is split over several processes - presumably this test must be run
        # on more than one MPI rank; confirm.
        self.assertNotEqual(dist_df.shape, dist_df.globalShape)

    #Testing from_dict function----------------------------------
    def test_distributed_df_creation_with_from_dict_function_orient_index(
            self):
        df = ParallelDataFrame.from_dict(self.dict2, orient='index')
        self.assertEqual(df.globalShape, (4, 3))
        self.assertEqual(set(list(df.globalIndex)),
                         set(['key1', 'key2', 'key3', 'key4']))
        self.assertEqual(list(df.globalColumns), [0, 1, 2])

    def test_distributed_df_creation_with_from_dict_function_orient_columns(
            self):
        df = ParallelDataFrame.from_dict(self.dict2, orient='columns')
        self.assertEqual(df.globalShape, (3, 4))
        self.assertEqual(set(list(df.globalColumns)),
                         set(['key1', 'key2', 'key3', 'key4']))
        self.assertEqual(list(df.globalIndex), [0, 1, 2])

    def test_replicated_df_creation_with_from_dict_function_orient_index(self):
        pd_df = pd.DataFrame.from_dict(self.dict2, orient='index')
        df = ParallelDataFrame.from_dict(self.dict2,
                                         orient='index',
                                         dist='replicated')
        self.assertTrue(df.equals(pd_df))

    def test_replicated_df_creation_with_from_dict_function_orient_columns(
            self):
        pd_df = pd.DataFrame.from_dict(self.dict2, orient='columns')
        df = ParallelDataFrame.from_dict(self.dict2,
                                         orient='columns',
                                         dist='replicated')
        self.assertTrue(df.equals(pd_df))

    #Testing global_to_local property---------------------------------
    def test_global_to_local_functionality_with_column_distribution(self):
        df = ParallelDataFrame(self.dict2, dist_data=False)
        self.assertTrue(isinstance(df.global_to_local, dict))
        self.assertEqual(len(df.global_to_local), 4)
        self.assertEqual(set(list(df.global_to_local.keys())),
                         set(['key1', 'key2', 'key3', 'key4']))

    def test_global_to_local_functionality_with_index_distribution(self):
        df = ParallelDataFrame.from_dict(self.dict2, orient='index')
        self.assertTrue(isinstance(df.global_to_local, dict))
        self.assertEqual(len(df.global_to_local), 4)
        self.assertEqual(set(list(df.global_to_local.keys())),
                         set(['key1', 'key2', 'key3', 'key4']))

    #Testing 'drop' function---------------------------------------
    def test_inplace_dropping_multiple_columns_in_column_distributed_dataframe(
            self):
        df = ParallelDataFrame(self.dict3, dist_data=False)
        self.assertEqual(df.globalShape, (3, 8))
        df.drop(['key4', 'key8'], axis=1, inplace=True)
        self.assertEqual(set(list(df.globalColumns)),
                         set(['key1', 'key2', 'key3', 'key5', 'key6', 'key7']))
        self.assertEqual(list(df.globalIndex), [0, 1, 2])

    # the ``columns=`` keyword of drop is only exercised on pandas >= 0.21
    if (get_pandas_version() >= 0.21):

        def test_inplace_dropping_multiple_columns_in_column_distributed_dataframe_specifying_columns(
                self):
            df = ParallelDataFrame(self.dict3, dist_data=False)
            self.assertEqual(df.globalShape, (3, 8))
            df.drop(columns=['key4', 'key8'], inplace=True)
            self.assertEqual(
                set(list(df.globalColumns)),
                set(['key1', 'key2', 'key3', 'key5', 'key6', 'key7']))
            self.assertEqual(list(df.globalIndex), [0, 1, 2])

    def test_non_inplace_dropping_single_column_in_column_distributed_dataframe(
            self):
        df = ParallelDataFrame(self.dict2, dist_data=False)
        self.assertEqual(df.globalShape, (3, 4))
        new_df = df.drop('key4', axis=1, inplace=False)
        self.assertEqual(set(list(new_df.globalColumns)),
                         set(['key1', 'key2', 'key3']))
        self.assertEqual(list(new_df.globalIndex), [0, 1, 2])

    def test_inplace_dropping_single_row_in_index_distributed_dataframe(self):
        df = ParallelDataFrame.from_dict(self.dict2, orient='index')
        self.assertEqual(df.globalShape, (4, 3))
        df.drop('key4', axis=0, inplace=True)
        self.assertEqual(set(list(df.globalIndex)),
                         set(['key1', 'key2', 'key3']))
        self.assertEqual(list(df.globalColumns), [0, 1, 2])

    def test_non_inplace_dropping_single_row_in_index_distributed_dataframe(
            self):
        df = ParallelDataFrame.from_dict(self.dict2, orient='index')
        self.assertEqual(df.globalShape, (4, 3))
        new_df = df.drop('key4', axis=0, inplace=False)
        self.assertEqual(set(list(new_df.globalIndex)),
                         set(['key1', 'key2', 'key3']))
        self.assertEqual(list(new_df.globalColumns), [0, 1, 2])

    def test_inplace_dropping_single_column_in_index_distributed_dataframe(
            self):
        df = ParallelDataFrame.from_dict(self.dict2, orient='index')
        self.assertEqual(df.globalShape, (4, 3))
        df.drop(1, axis=1, inplace=True)
        self.assertEqual(set(list(df.globalIndex)),
                         set(['key1', 'key2', 'key3', 'key4']))
        self.assertEqual(list(df.globalColumns), [0, 2])

    def test_inplace_dropping_single_row_in_column_distributed_dataframe(self):
        df = ParallelDataFrame(self.dict2, dist_data=False)
        self.assertEqual(df.globalShape, (3, 4))
        df.drop(2, axis=0, inplace=True)
        self.assertEqual(set(list(df.globalColumns)),
                         set(['key1', 'key2', 'key3', 'key4']))
        self.assertEqual(list(df.globalIndex), [0, 1])

    # the ``index=`` keyword of drop is only exercised on pandas >= 0.21
    if (get_pandas_version() >= 0.21):

        def test_inplace_dropping_single_row_in_column_distributed_dataframe_specifying_index(
                self):
            df = ParallelDataFrame(self.dict2, dist_data=False)
            self.assertEqual(df.globalShape, (3, 4))
            df.drop(index=2, inplace=True)
            self.assertEqual(set(list(df.globalColumns)),
                             set(['key1', 'key2', 'key3', 'key4']))
            self.assertEqual(list(df.globalIndex), [0, 1])

    def test_inplace_dropping_single_row_replicated_dataframe(self):
        df = ParallelDataFrame(self.dict2, dist='replicated')
        df.drop(2, axis=0, inplace=True)
        self.assertEqual(set(list(df.globalColumns)),
                         set(['key1', 'key2', 'key3', 'key4']))
        self.assertEqual(list(df.globalIndex), [0, 1])

    def test_non_inplace_dropping_single_column_replicated_dataframe(self):
        df = ParallelDataFrame(self.dict2, dist='replicated')
        new_df = df.drop('key4', axis=1, inplace=False)
        self.assertEqual(set(list(new_df.globalColumns)),
                         set(['key1', 'key2', 'key3']))
        self.assertEqual(list(new_df.globalIndex), [0, 1, 2])

    #new index/column introduced in Pandas version 0.21
    if (get_pandas_version() >= 0.21):

        def test_non_inplace_dropping_multiple_columns_replicated_dataframe(
                self):
            df = ParallelDataFrame(self.dict3, dist='replicated')
            new_df = df.drop(columns=['key4', 'key7'], inplace=False)
            self.assertEqual(
                set(list(new_df.globalColumns)),
                set(['key1', 'key2', 'key3', 'key5', 'key6', 'key8']))
            self.assertEqual(list(new_df.globalIndex), [0, 1, 2])

        def test_non_inplace_dropping_multiple_columns_and_row_in_same_call_replicated_dataframe(
                self):
            df = ParallelDataFrame(self.dict3, dist='replicated')
            new_df = df.drop(columns=['key4', 'key7'], index=1, inplace=False)
            self.assertEqual(
                set(list(new_df.globalColumns)),
                set(['key1', 'key2', 'key3', 'key5', 'key6', 'key8']))
            self.assertEqual(list(new_df.globalIndex), [0, 2])

    #Testing apply function----------------------------------------------------
    #The examples below have been inspired by the examples given in the Pandas documentation
    def test_column_distributed_df_apply_function_sqrt_returns_distributed_df(
            self):
        df1 = ParallelDataFrame(self.pd_df1, dist_data=False)
        result = df1.apply(np.sqrt)
        # squaring the square roots must give back the original frame
        df3 = result.apply(np.square)
        self.assertTrue(isinstance(result, ParallelDataFrame))
        self.assertEqual(result.dist, 'distributed')
        self.assertFalse(result.equals(df1))
        self.assertTrue(df1.equals(df3))

    def test_column_distributed_df_apply_function_sum_returns_distributed_series_raw_True(
            self):
        df1 = ParallelDataFrame(self.pd_df1, dist_data=False)
        pd_result = self.pd_df1.apply(np.sum, axis=0, raw=True)
        result = df1.apply(np.sum, axis=0, raw=True)
        self.assertTrue(isinstance(result, ParallelSeries))
        self.assertEqual(result.dist, 'distributed')
        self.assertEqual(set(list(result.globalIndex)),
                         set(list(pd_result.index)))
        # compare after sorting: collect() is not assumed to keep the order
        self.assertTrue(result.collect().sort_index().equals(
            pd_result.sort_index()))

    def test_column_distributed_df_apply_function_sum_returns_distributed_series_raw_False(
            self):
        df1 = ParallelDataFrame(self.pd_df1, dist_data=False)
        pd_result = self.pd_df1.apply(np.sum, axis=0, raw=False)
        result = df1.apply(np.sum, axis=0, raw=False)
        self.assertTrue(isinstance(result, ParallelSeries))
        self.assertEqual(result.dist, 'distributed')
        self.assertEqual(set(list(result.globalIndex)),
                         set(list(pd_result.index)))
        self.assertTrue(result.collect().sort_index().equals(
            pd_result.sort_index()))

    def test_replicated_df_apply_function_sqrt_returns_replicated_df(self):
        df1 = ParallelDataFrame(self.pd_df1, dist='replicated')
        pd_result = self.pd_df1.apply(np.sqrt)
        result = df1.apply(np.sqrt)
        self.assertTrue(result.equals(pd_result))
        self.assertEqual(result.dist, 'replicated')

    def test_replicated_df_apply_function_sum_axis0_returns_replicated_series(
            self):
        df1 = ParallelDataFrame(self.pd_df1, dist='replicated')
        pd_result = self.pd_df1.apply(np.sum, axis=0)
        result = df1.apply(np.sum, axis=0)
        self.assertTrue(isinstance(result, ParallelSeries))
        self.assertEqual(result.dist, 'replicated')
        self.assertTrue(result.equals(pd_result))

    def test_replicated_df_apply_function_sum_axis1_returns_replicated_series(
            self):
        df1 = ParallelDataFrame(self.pd_df1, dist='replicated')
        pd_result = self.pd_df1.apply(np.sum, axis=1)
        result = df1.apply(np.sum, axis=1)
        self.assertTrue(isinstance(result, ParallelSeries))
        self.assertEqual(result.dist, 'replicated')
        self.assertTrue(result.equals(pd_result))

    def test_replicated_df_apply_function_list_like_result_returns_replicated_series(
            self):
        df = ParallelDataFrame(self.pd_df1, dist='replicated')
        pd_result = self.pd_df1.apply(lambda x: [1, 2], axis=1)
        result = df.apply(lambda x: [1, 2], axis=1)
        self.assertTrue(isinstance(result, ParallelSeries))
        self.assertEqual(result.dist, 'replicated')
        self.assertTrue(result.equals(pd_result))

    # ``result_type`` is only exercised on pandas >= 0.23
    if (get_pandas_version() >= 0.23):

        def test_replicated_df_apply_function_list_like_result_expand_returns_replicated_df(
                self):
            df = ParallelDataFrame(self.pd_df1, dist='replicated')
            pd_result = self.pd_df1.apply(lambda x: [1, 2],
                                          axis=1,
                                          result_type='expand')
            result = df.apply(lambda x: [1, 2], axis=1, result_type='expand')
            self.assertTrue(isinstance(result, ParallelDataFrame))
            self.assertEqual(result.dist, 'replicated')
            self.assertTrue(result.equals(pd_result))

    #Testing 'div' function---------------------------------------------
    #The examples below have been inspired by the examples from the Pandas documentation
    def test_div_constant_replicated_df(self):
        df = ParallelDataFrame(self.pd_df1, dist='replicated')
        result = df.div(10)
        pd_result = self.pd_df1.div(10)
        self.assertTrue(result.equals(pd_result))

    def test_div_constant_distributed_df(self):
        df1 = ParallelDataFrame(self.pd_df1, dist_data=False)
        # expected value: divide with pandas first, then distribute
        pd_df2 = self.pd_df1.div(10)
        df2 = ParallelDataFrame(pd_df2, dist_data=False)
        result = df1.div(10)
        self.assertTrue(result.equals(df2))

    def test_div_by_multiIndex_by_level_replicated_df(self):
        df = ParallelDataFrame(self.pd_df2, dist='replicated')
        rep_multindex = ParallelDataFrame(self.df_multindex, dist='replicated')
        result = df.div(rep_multindex, level=1, fill_value=0)
        pd_result = self.pd_df2.div(self.df_multindex, level=1, fill_value=0)
        self.assertTrue(result.equals(pd_result))

    #Testing slicing--------------------------------------------------
    def test_slicing_with_single_label_getting_dist_series_from_column_distributed_df(
            self):
        d = {'col1': [1, 2], 'col2': [3, 4], 'col3': [5, 6]}
        pd_df = pd.DataFrame(data=d)
        dist_df = ParallelDataFrame(data=d, dist_data=False)
        dist_series = dist_df.loc[1]
        pd_series = pd_df.loc[1]
        self.assertTrue(isinstance(dist_series, ParallelSeries))
        self.assertEqual(dist_series.dist, 'distributed')
        self.assertTrue(dist_series.collect().sort_index().equals(
            pd_series.sort_index()))

    def test_slicing_with_slice_object_getting_dist_df_in_column_distributed_df(
            self):
        d = {'col1': [1, 2], 'col2': [3, 4], 'col3': [5, 6]}
        pd_df = pd.DataFrame(data=d)
        dist_df = ParallelDataFrame(data=d, dist_data=False)
        dist_slice = dist_df.loc[0:1]
        pd_slice = pd_df.loc[0:1]
        # expected value: slice with pandas first, then distribute
        pd_slice_dist = ParallelDataFrame(data=pd_slice, dist_data=False)
        self.assertTrue(isinstance(dist_slice, ParallelDataFrame))
        self.assertEqual(dist_slice.dist, 'distributed')
        self.assertTrue(dist_slice.equals(pd_slice_dist))

    def test_slicing_with_single_label_getting_rep_series_from_replicated_df(
            self):
        d = {'col1': [1, 2], 'col2': [3, 4], 'col3': [5, 6]}
        pd_df = pd.DataFrame(data=d)
        rep_df = ParallelDataFrame(data=d, dist='replicated')
        rep_series = rep_df.loc[1]
        pd_series = pd_df.loc[1]
        self.assertTrue(isinstance(rep_series, ParallelSeries))
        self.assertEqual(rep_series.dist, 'replicated')
        self.assertTrue(rep_series.sort_index().equals(pd_series.sort_index()))

    def test_slicing_with_list_of_labels_getting_rep_df_from_replicated_df(
            self):
        d = {'col1': [1, 2, 4, 5], 'col2': [3, 4, 6, 7], 'col3': [5, 6, 1, 3]}
        pd_df = pd.DataFrame(data=d)
        rep_df = ParallelDataFrame(data=d, dist='replicated')
        rep_slice = rep_df.loc[[0, 3]]
        pd_slice = pd_df.loc[[0, 3]]
        self.assertTrue(isinstance(rep_slice, ParallelDataFrame))
        self.assertEqual(rep_slice.dist, 'replicated')
        self.assertTrue(rep_slice.sort_index().equals(pd_slice.sort_index()))

    def test_slicing_getting_cell_value_in_replicated_df(self):
        d = {'col1': [1, 2], 'col2': [3, 4], 'col3': [5, 6]}
        pd_df = pd.DataFrame(data=d)
        rep_df = ParallelDataFrame(data=d, dist='replicated')
        # despite the names these are single cell values, not series
        rep_series = rep_df.loc[1, 'col2']
        pd_series = pd_df.loc[1, 'col2']
        self.assertEqual(rep_series, pd_series)

    def test_slicing_with_boolean_array_getting_rep_df_from_replicated_df(
            self):
        d = {'col1': [1, 2, 4, 5], 'col2': [3, 4, 6, 7], 'col3': [5, 6, 1, 3]}
        pd_df = pd.DataFrame(data=d)
        rep_df = ParallelDataFrame(data=d, dist='replicated')
        rep_series = rep_df.loc[[True, False, False, True]]
        pd_series = pd_df.loc[[True, False, False, True]]
        self.assertTrue(rep_series.sort_index().equals(pd_series.sort_index()))

    #testing corr----------------------------------------------------------------------
    def test_corr_with_col_distributed_dataframe(self):
        pd_df = pd.DataFrame([(.2, .3, .4), (.0, .6, .9), (.6, .0, .6),
                              (.2, .1, .1)],
                             columns=['dogs', 'cats', 'rats'])
        dist_df = ParallelDataFrame(pd_df, dist_data=False)
        dist_corr = dist_df.corr()
        pd_corr = pd_df.corr()
        #compare values of each row (rounded to 6 digits)
        for row in dist_corr.globalIndex:
            self.assertEqual(
                list(dist_corr.loc[row].collect().sort_index().round(6)),
                list(pd_corr.loc[row].sort_index().round(6)))

    def test_corr_with_replicated_dataframe(self):
        pd_df = pd.DataFrame([(.2, .3, .4), (.0, .6, .9), (.6, .0, .6),
                              (.2, .1, .1)],
                             columns=['dogs', 'cats', 'rats'])
        rep_df = ParallelDataFrame(pd_df, dist='replicated')
        rep_corr = rep_df.corr()
        pd_corr = pd_df.corr()
        self.assertTrue(rep_corr.equals(pd_corr))
def drop(self, labels=None, axis=0, index=None, columns=None, level=None,
         inplace=False, errors='raise'):
    """Drop rows or columns, taking the data distribution into account.

    When the frame is distributed and the drop targets the distributed axis
    (columns for ``orient == 'columns'``, rows for ``orient == 'index'``),
    every requested label must exist in ``global_to_local``, but only the
    labels present in the local part are actually removed. In every other
    case (replicated frame, or a drop along the non-distributed axis) the
    call is forwarded to pandas unchanged.

    Args:
        labels: single label or list of labels to drop along ``axis``.
        axis: 0/'index' or 1/'columns'; ignored when ``labels`` is None and
            ``index``/``columns`` is used instead.
        index, columns: keyword alternatives to ``labels`` + ``axis``
            (forwarded to pandas from pandas 0.21 on).
        level, errors: forwarded to pandas.
        inplace: when true the frame is modified and ``self`` is returned.

    Returns:
        ``self`` when ``inplace`` is true, otherwise a new frame built with
        this object's ``dist``, ``comm`` and ``orient``.

    Raises:
        Exception: a label requested along the distributed axis is not in
            ``global_to_local`` (raised regardless of ``errors``).
    """
    pandas_version = get_pandas_version()
    if axis == 'columns':
        axis = 1
    if axis == 'index':
        axis = 0

    # Which axis is really targeted: ``labels`` goes with ``axis``; without
    # ``labels`` the ``index``/``columns`` keywords decide and the (default)
    # ``axis`` must not influence the routing.
    if labels is not None:
        drops_rows, drops_columns = axis == 0, axis == 1
    else:
        drops_rows, drops_columns = index is not None, columns is not None
    on_distributed_axis = ((drops_columns and self.orient == 'columns') or
                           (drops_rows and self.orient == 'index'))

    # Parenthesised on purpose: a replicated frame must never take the
    # distributed path (the original ``a and b or c`` let it through).
    if self.dist == 'distributed' and on_distributed_axis:
        if labels is None:
            # turn the keyword form into labels + axis
            if self.orient == 'index':
                axis, labels, index = 0, index, None
            else:
                axis, labels, columns = 1, columns, None

        is_label_list = isinstance(labels, list)
        missing_message = ("Column/Row does not exist!"
                           if is_label_list else "Column does not exist!")
        requested = labels if is_label_list else [labels]

        col_or_row_names = self.global_to_local.keys()
        local_names = self.columns.values if axis == 1 else self.index.values

        # A scalar label is wrapped in a list as well, so the emptiness test
        # below also works for labels without a length (e.g. integers).
        local_labels = []
        for a_label in requested:
            if a_label not in col_or_row_names:
                raise Exception(missing_message)
            if a_label in local_names:
                local_labels.append(a_label)

        if inplace and self.get_global_to_local() is not None:
            self._global_to_local = None

        if not local_labels:
            # some other node had the item(s) to be dropped
            if inplace:
                return self
            # the original built this object but forgot to return it
            return self.__constructor(data=self,
                                      dist=self.dist,
                                      comm=self.comm,
                                      orient=self.orient,
                                      dist_data=True)
        drop_labels = local_labels
    else:
        # for replicated distribution OR
        # for dropping a row in a column-distribution OR
        # a column in row-distribution
        drop_labels = labels

    # keywords instead of positionals: pandas 2.x accepts only ``labels``
    # positionally
    drop_kwargs = {
        'labels': drop_labels,
        'axis': axis,
        'level': level,
        'inplace': inplace,
        'errors': errors,
    }
    if pandas_version >= 0.21:
        drop_kwargs['index'] = index
        drop_kwargs['columns'] = columns
    super_return = super().drop(**drop_kwargs)

    if inplace:
        return self
    return self.__constructor(data=super_return,
                              dist=self.dist,
                              comm=self.comm,
                              orient=self.orient)