def test_agg_multikey_parallel(self):
    """Compare a two-key groupby-sum on distributed inputs with plain pandas.

    The jitted function receives only the ``[start:end]`` slice of each
    column (bounds come from ``get_start_end``), while the reference call
    receives the full columns; both scalar results must be equal.
    """
    def test_impl(in_A, in_B, in_C):
        df = pd.DataFrame({'A': in_A, 'B': in_B, 'C': in_C})
        A = df.groupby(['A', 'C'])['B'].sum()
        return A.sum()

    input_dists = {
        'in_A:input': 'distributed',
        'in_B:input': 'distributed',
        'in_C:input': 'distributed',
    }
    hpat_func = self.jit(locals=input_dists)(test_impl)

    data = pd.DataFrame({
        'A': [2, 1, 1, 1, 2, 2, 1],
        'B': [-8, 2, 3, 1, 5, 6, 7],
        'C': [3, 5, 6, 5, 4, 4, 3],
    })
    lo, hi = get_start_end(len(data))

    full_columns = (data.A.values, data.B.values, data.C.values)
    local_chunks = tuple(column[lo:hi] for column in full_columns)

    actual = hpat_func(*local_chunks)
    expected = test_impl(*full_columns)
    self.assertEqual(actual, expected)
def test_reduce_filter1(self):
    """Check reductions applied after a boolean filter on a distributed array.

    For every (dtype, reduction) pair a small function ``f`` is generated
    from source text, jitted with its input marked ``'distributed'`` and
    called on the ``[start:end]`` slice of the data. Its result is compared
    (to 3 decimals) with the un-jitted ``f`` applied to the whole array.
    After each compilation the REP counters are asserted to be zero.
    """
    import sys

    dtypes = ['float32', 'float64', 'int32', 'int64']
    funcs = ['sum', 'prod', 'min', 'max', 'argmin', 'argmax']
    on_windows = sys.platform.startswith('win')

    # The template needs real line breaks: a one-line
    # "def f(A): A = ... return ..." is a SyntaxError when exec'd.
    func_template = (
        "def f(A):\n"
        "    A = A[A>5]\n"
        "    return A.{}()\n"
    )

    # Loop-invariant setup: the slice bounds and the seeded random integers
    # are the same for every combination; only the dtype cast differs.
    n = 21
    start, end = get_start_end(n)
    np.random.seed(0)
    base_values = np.random.randint(0, 10, n)

    for dtype, func in itertools.product(dtypes, funcs):
        # loc allreduce doesn't support int64 on windows
        if on_windows and dtype == 'int64' and func in ('argmin', 'argmax'):
            continue

        func_text = func_template.format(func)
        loc_vars = {}
        exec(func_text, {'np': np}, loc_vars)
        test_impl = loc_vars['f']

        hpat_func = self.jit(locals={'A:input': 'distributed'})(test_impl)
        A = base_values.astype(dtype)
        np.testing.assert_almost_equal(
            hpat_func(A[start:end]),
            test_impl(A),
            decimal=3,
            err_msg="{} on {}".format(func, dtype))

        # Checked per combination so a regression in any single
        # dtype/reduction pair is caught, not only the last one compiled.
        self.assertEqual(count_array_REPs(), 0)
        self.assertEqual(count_parfor_REPs(), 0)
def test_join_datetime_parallel1(self):
    """Merge two distributed frames on a datetime column and compare sums.

    Both frames are passed to the jitted function as ``iloc`` slices chosen
    by ``get_start_end``; the reference call uses the complete frames. The
    returned tuples must be equal and both REP counters must be zero.
    """
    def test_impl(df1, df2):
        df3 = pd.merge(df1, df2, on='time')
        return (df3.A.sum(), df3.time.max(), df3.B.sum())

    hpat_func = sdc.jit(distributed=['df1', 'df2'])(test_impl)

    left_times = pd.DatetimeIndex(['2017-01-03', '2017-01-06', '2017-02-21'])
    right_times = pd.DatetimeIndex(['2017-01-01', '2017-01-06', '2017-01-03'])
    left = pd.DataFrame({'time': left_times, 'B': [4, 5, 6]})
    right = pd.DataFrame({'time': right_times, 'A': [7, 8, 9]})

    left_lo, left_hi = get_start_end(len(left))
    right_lo, right_hi = get_start_end(len(right))

    actual = hpat_func(left.iloc[left_lo:left_hi],
                       right.iloc[right_lo:right_hi])
    expected = test_impl(left, right)

    self.assertEqual(actual, expected)
    self.assertEqual(count_array_REPs(), 0)
    self.assertEqual(count_parfor_REPs(), 0)
def test_str_split_parallel(self):
    """Run ``Series.str.split(',')`` with input and output marked distributed.

    The frame is built from the ``[start:end]`` slice of the sample strings
    and the same local frame is fed to both the jitted and the plain
    function, so the comparison is chunk against chunk. Series names are
    not compared.
    """
    def test_impl(df):
        B = df.A.str.split(',')
        return B

    n = 5
    lo, hi = get_start_end(n)
    samples = ['AB,CC', 'C,ABB,D', 'CAD', 'CA,D', 'AA,,D']
    df = pd.DataFrame({'A': samples[lo:hi]})

    hpat_func = self.jit(distributed={'df', 'B'})(test_impl)

    actual = hpat_func(df)
    expected = test_impl(df)
    pd.testing.assert_series_equal(actual, expected, check_names=False)

    self.assertEqual(count_array_REPs(), 3)
    self.assertEqual(count_parfor_REPs(), 0)
def test_str_replace_regex_parallel(self):
    """Run a regex ``Series.str.replace`` with input and output distributed.

    The frame holds only the ``[start:end]`` slice of the sample strings and
    that same local frame is given to both the jitted and the plain
    function. Series names are not compared.
    """
    def test_impl(df):
        B = df.A.str.replace('AB*', 'EE', regex=True)
        return B

    n = 5
    samples = ['ABCC', 'CABBD', 'CCD', 'CCDAABB', 'ED']
    lo, hi = get_start_end(n)
    df = pd.DataFrame({'A': samples[lo:hi]})

    hpat_func = self.jit(distributed={'df', 'B'})(test_impl)

    actual = hpat_func(df)
    expected = test_impl(df)
    pd.testing.assert_series_equal(actual, expected, check_names=False)

    self.assertEqual(count_array_REPs(), 3)
    self.assertEqual(count_parfor_REPs(), 0)
def test_df_input_dist1(self):
    """Sum one column of a DataFrame passed in as a distributed argument.

    The jitted function gets a frame built from the ``[start:end]`` slice of
    each column; the reference call gets the full frame. The two sums must
    agree and both REP counters must be zero.
    """
    def test_impl(df):
        return df.B.sum()

    A = [3, 4, 5, 6, 1]
    B = [5, 6, 2, 1, 3]
    # Length derived from the data; the former `n = 121` was overwritten
    # before it was ever read.
    n = len(A)
    start, end = get_start_end(n)

    df = pd.DataFrame({'A': A, 'B': B})
    df_h = pd.DataFrame({'A': A[start:end], 'B': B[start:end]})

    hpat_func = sdc.jit(distributed={'df'})(test_impl)
    np.testing.assert_almost_equal(hpat_func(df_h), test_impl(df))

    self.assertEqual(count_array_REPs(), 0)
    self.assertEqual(count_parfor_REPs(), 0)
def test_var_dist1(self):
    """Compile and run a groupby followed by a column assignment.

    Inputs and the returned frame are marked ``'distributed'``. The result
    is not compared with anything: the test only requires that compilation
    and execution on the local slice finish without raising.
    """
    def test_impl(A, B):
        df = pd.DataFrame({'A': A, 'B': B})
        df2 = df.groupby('A', as_index=False)['B'].sum()
        # TODO: fix handling of df setitem to force match of array dists
        # probably with a new node that is appended to the end of basic block
        # df2['C'] = np.full(len(df2.B), 3, np.int8)
        # TODO: full_like for Series
        df2['C'] = np.full_like(df2.B.values, 3, np.int8)
        return df2

    keys = np.array([1, 1, 2, 3])
    values = np.array([3, 4, 5, 6])

    dist_spec = {
        'A:input': 'distributed',
        'B:input': 'distributed',
        'df2:return': 'distributed',
    }
    hpat_func = self.jit(locals=dist_spec)(test_impl)

    lo, hi = get_start_end(len(keys))
    # Smoke test only: the returned frame is intentionally not inspected.
    hpat_func(keys[lo:hi], values[lo:hi])
def test_join_left_parallel1(self):
    """Merge on two keys where only the first frame's columns are distributed.

    ``A1``, ``B1`` and ``C1`` are passed as ``[start:end]`` slices and marked
    ``'distributed'``; ``A2``, ``B2`` and ``D2`` are passed whole. The jitted
    scalar must equal the plain-pandas result on the full columns, and
    ``count_array_OneDs()`` must report 3.
    """
    def test_impl(A1, B1, C1, A2, B2, D2):
        df1 = pd.DataFrame({'A': A1, 'B': B1, 'C': C1})
        df2 = pd.DataFrame({'A': A2, 'B': B2, 'D': D2})
        df3 = df1.merge(df2, on=('A', 'B'))
        return df3.C.sum() + df3.D.sum()

    left_dists = {
        'A1:input': 'distributed',
        'B1:input': 'distributed',
        'C1:input': 'distributed',
    }
    hpat_func = sdc.jit(locals=left_dists)(test_impl)

    left = pd.DataFrame({
        'A': [3, 1, 1, 3, 4],
        'B': [1, 2, 3, 2, 3],
        'C': [7, 8, 9, 4, 5],
    })
    right = pd.DataFrame({
        'A': [2, 1, 4, 4, 3],
        'B': [1, 3, 2, 3, 2],
        'D': [1, 2, 3, 4, 8],
    })
    start, end = get_start_end(len(left))

    left_columns = [left.A.values, left.B.values, left.C.values]
    right_columns = [right.A.values, right.B.values, right.D.values]
    # Only the left-hand columns are cut down to the local slice.
    left_chunks = [column[start:end] for column in left_columns]

    h_res = hpat_func(*left_chunks, *right_columns)
    p_res = test_impl(*left_columns, *right_columns)

    self.assertEqual(h_res, p_res)
    self.assertEqual(count_array_OneDs(), 3)