def test_get_thor_file_uses_custom_max_workers(self, mock):
    """Check that ``max_workers=2`` is forwarded to the patched callable.

    ``mock`` is injected from outside this block; its patch target is not
    visible here (presumably the thread pool executor class -- confirm).
    """
    file_name = "test_get_thor_file_uses_custom_max_workers"
    # The patched callable hands back a real executor when it is invoked.
    mock.return_value = ThreadPoolExecutor(max_workers=15)

    create_script = (
        "a := DATASET([{{1}}, {{2}}], {{INTEGER int;}}); "
        "OUTPUT(a,,'~{}');"
    ).format(file_name)
    self.conn.run_ecl_string(create_script, True, True, None)

    get_thor_file(self.conn, file_name, max_workers=2)

    mock.assert_called_with(max_workers=2)
def test_get_thor_file_chunks_when_num_rows_equal_to_chunksize(self, mock):
    """Read a two-row file with ``chunk_size=1`` and check the last call.

    ``mock`` is injected from outside this block; its patch target is not
    visible here (presumably the connection's chunk reader -- confirm).

    NOTE(review): the name says the row count equals the chunk size, but
    the file holds two rows and ``chunk_size`` is 1 -- confirm intent.
    """
    mock.return_value = {'int': ['1'], '__fileposition__': ['0']}
    file_name = "test_get_thor_file_chunks_when_num_rows_equal_to_chunksize"

    create_script = (
        "a := DATASET([{{1}}, {{2}}], {{INTEGER int;}}); "
        "OUTPUT(a,,'~{}');"
    ).format(file_name)
    self.conn.run_ecl_string(create_script, True, True, None)

    get_thor_file(connection=self.conn, thor_file=file_name, chunk_size=1)

    # assert_called_with only inspects the most recent call on the mock.
    mock.assert_called_with(file_name, 1, 1, 3, 60)
def test_get_thor_file_uses_max_sleep(self, mock):
    """Check that ``max_sleep=120`` shows up as the last argument to ``mock``.

    ``mock`` is injected from outside this block; its patch target is not
    visible here (presumably the connection's chunk reader -- confirm).
    """
    file_name = "test_get_thor_file_uses_max_sleep"
    mock.return_value = {"int": [1], "__fileposition__": [0]}

    create_script = (
        "a := DATASET([{{1}}, {{2}}], {{INTEGER int;}}); "
        "OUTPUT(a,,'~{}');"
    ).format(file_name)
    # Unlike the sibling tests, the third positional flag here is False.
    self.conn.run_ecl_string(create_script, True, False, None)

    get_thor_file(self.conn, file_name, max_sleep=120)

    mock.assert_called_with(file_name, 0, 2, 3, 120)
def test_get_thor_file_works_when_chunksize_is_zero(self):
    """Expect ``ZeroDivisionError`` when ``chunk_size=0`` is requested."""
    file_name = "test_get_thor_file_works_when_chunksize_is_zero"
    create_script = (
        "a := DATASET([{{1}}, {{2}}], {{INTEGER int;}}); "
        "OUTPUT(a,,'~{}');"
    ).format(file_name)
    self.conn.run_ecl_string(create_script, True, True, None)

    # Callable form of assertRaises: the call happens inside the assertion.
    self.assertRaises(
        ZeroDivisionError,
        get_thor_file,
        connection=self.conn,
        thor_file=file_name,
        chunk_size=0,
    )
def test_get_thor_file_chunks_when_num_rows_above_chunksize(self, mock):
    """Read two rows with ``chunk_size=1`` and check both chunk calls.

    ``mock`` is injected from outside this block; its patch target is not
    visible here (presumably the connection's chunk reader -- confirm).
    """
    mock.return_value = {'int': ['1'], '__fileposition__': ['0']}
    file_name = (
        "test_get_thor_file_chunks_when_"
        "num_rows_greater_than_chunksize"
    )

    create_script = (
        "a := DATASET([{{1}}, {{2}}], {{INTEGER int;}}); "
        "OUTPUT(a,,'~{}');"
    ).format(file_name)
    self.conn.run_ecl_string(create_script, True, True, None)

    get_thor_file(connection=self.conn, thor_file=file_name, chunk_size=1)

    # One call per row, with the second positional argument 0 then 1.
    expected = [
        unittest.mock.call(file_name, start, 1, 3, 60) for start in (0, 1)
    ]
    self.assertEqual(expected, mock.call_args_list)
def test_get_thor_file_uses_dict_of_dtypes_with_extra_cols(self):
    """Expect ``KeyError`` when ``dtype`` names a column the file lacks."""
    file_name = "test_get_thor_file_uses_dict_of_dtypes_with_extra_cols"
    create_script = (
        "a := DATASET([{{'1', TRUE, 1}}, {{'2', FALSE, 2}}], "
        "{{STRING str; BOOLEAN bool; INTEGER int;}}); "
        "OUTPUT(a,,'~{}');"
    ).format(file_name)
    self.conn.run_ecl_string(create_script, True, True, None)

    # "made_up" is not one of the columns declared in the ECL record above.
    requested_dtypes = {"bool": bool, "int": str, "made_up": str}
    with self.assertRaises(KeyError):
        get_thor_file(self.conn, file_name, dtype=requested_dtypes)
def test_spray_file_string(self):
    """Spray a CSV given by path and check it reads back unchanged."""
    conn = hpycc.Connection("user", test_conn=False)
    thor_file = '~thor::test_spray_file_string'

    # Positional options forwarded to spray_file below.
    overwrite = True
    expire = 1
    chunk_size = 10000
    max_workers = 3
    delete_workunit = True

    column_a = ['1', '3', '5', '6']
    column_b = ['aa', 'ab', 'ac', 'ad']
    frame = pd.DataFrame({"a": column_a, "b": column_b})
    df = frame.sort_values('a')

    with TemporaryDirectory() as tmp_dir:
        csv_path = os.path.join(tmp_dir, "test.csv")
        df.to_csv(csv_path, index=False)
        spray_file(conn, csv_path, thor_file, overwrite, expire,
                   chunk_size, max_workers, delete_workunit)

    fetched = get_thor_file(connection=conn, thor_file=thor_file)
    res = fetched[['a', 'b']]
    pd.testing.assert_frame_equal(df, res)
def test_get_thor_file_uses_dict_of_dtypes(self):
    """Check that a per-column ``dtype`` mapping is applied to the result."""
    file_name = "test_get_thor_file_uses_dict_of_dtypes"
    create_script = (
        "a := DATASET([{{'1', TRUE, 1}}, {{'2', FALSE, 2}}], "
        "{{STRING str; BOOLEAN bool; INTEGER int;}}); "
        "OUTPUT(a,,'~{}');"
    ).format(file_name)
    self.conn.run_ecl_string(create_script, True, True, None)

    column_types = {
        "str": int,
        "bool": bool,
        "int": str,
        "__fileposition__": str,
    }
    # Pass a copy so the expectation below uses an untouched mapping.
    res = get_thor_file(self.conn, file_name, dtype=dict(column_types))

    raw_expected = pd.DataFrame({
        "int": ["1", "2"],
        "str": [1, 2],
        "bool": [True, False],
        "__fileposition__": ["0", "14"],
    })
    expected = raw_expected.astype(column_types)

    # Column order is not part of the contract being checked.
    pd.testing.assert_frame_equal(expected.sort_index(axis=1),
                                  res.sort_index(axis=1))
def test_concatenate_logical_files_concatenates_one_file(self):
    """Concatenate a single logical file and check the result reads back.

    One source file is written via ``send_file_chunks`` and then passed to
    ``_concatenate_logical_files``; the resulting file's ``a`` and ``b``
    columns are compared against the values that were written.
    """
    thor_file = '~thor::test_concatenate_logical_files_concatenates_one_file'
    overwrite = True
    expire = 1
    delete_workunit = True
    conn = hpycc.Connection("user", test_conn=False)

    output_names = ['~a2']
    col_1_values = ['1']
    col_2_values = ['aa']

    # Plain loop: the calls are made for their side effects only, so there
    # is no reason to build and discard a list of their return values.
    for col1, col2, nam in zip(col_1_values, col_2_values, output_names):
        send_file_chunks(conn, CONCAT_SCRIPT_BASE % (col1, col2, nam))

    _concatenate_logical_files(conn, output_names, thor_file,
                               'STRING a; STRING b;', overwrite, expire,
                               delete_workunit)

    res = get_thor_file(
        connection=conn, thor_file=thor_file)[['a', 'b']].sort_values("a")
    expected_result = pd.DataFrame({
        "a": col_1_values,
        "b": col_2_values
    }).sort_values("a")

    pd.testing.assert_frame_equal(expected_result, res)
def test_concatenate_logical_files_concatenates_files(self):
    """Concatenate four logical files and check the combined rows.

    Each source file is written via ``send_file_chunks``; the files are
    then passed to ``_concatenate_logical_files`` and the resulting file's
    ``a`` and ``b`` columns are compared, sorted by ``a`` and with the
    index reset, against the values that were written.
    """
    thor_file = '~thor::testsprayconcatenatelogicalfiles'
    overwrite = True
    expire = 1
    delete_workunit = True

    output_names = ['~a1', '~b1', '~c1', '~x1']
    col_1_values = ['1', '3', '5', '6']
    col_2_values = ['aa', 'ab', 'ac', 'ad']

    # Plain loop: the calls are made for their side effects only, so there
    # is no reason to build and discard a list of their return values.
    for col1, col2, nam in zip(col_1_values, col_2_values, output_names):
        send_file_chunks(self.conn, CONCAT_SCRIPT_BASE % (col1, col2, nam))

    _concatenate_logical_files(self.conn, output_names, thor_file,
                               'STRING a; STRING b;', overwrite, expire,
                               delete_workunit)

    res = get_thor_file(
        connection=self.conn,
        thor_file=thor_file)[['a', 'b']].sort_values("a")
    expected_result = pd.DataFrame({
        "a": col_1_values,
        "b": col_2_values
    }).sort_values("a")

    # Index labels are reset on both sides, so only row content and order
    # after sorting are compared.
    pd.testing.assert_frame_equal(expected_result.reset_index(drop=True),
                                  res.reset_index(drop=True),
                                  check_names=False)
def test_get_thor_file_parses_column_types_correctly(self):
    """Write one single-row file per ECL type and check the parsed value.

    Each spec is ``(ecl_type, column_name, ecl_literal[, expected_value])``.
    When the fourth element is absent the ECL literal itself is the value
    expected back.
    """
    int_lit = 1
    dec_lit = 1.5
    unicode_lit = "U'ABC'"
    string_lit = "'ABC'"
    bool_lit = "TRUE"
    data_lit = "x'ABC'"
    plain_text = "ABC"

    type_specs = [
        ("INTEGER", "int", int_lit),
        ("INTEGER1", "int1", int_lit),
        ("UNSIGNED INTEGER", "unsigned_int", int_lit),
        ("UNSIGNED INTEGER1", "unsigned_int_1", int_lit),
        ("UNSIGNED8", "is_unsigned_8", int_lit),
        ("UNSIGNED", "usigned", int_lit),
        ("DECIMAL10", "dec10", dec_lit, round(dec_lit)),
        ("DECIMAL5_3", "dec5_3", dec_lit),
        ("UNSIGNED DECIMAL10", "unsigned_dec10", dec_lit, round(dec_lit)),
        ("UNSIGNED DECIMAL5_3", "unsigned_decl5_3", dec_lit),
        ("UDECIMAL10", "udec10", dec_lit, round(dec_lit)),
        ("UDECIMAL5_3", "udec5_3", dec_lit),
        ("REAL", "is_real", dec_lit),
        ("REAL4", "is_real4", dec_lit),
        ("UNICODE", "ucode", unicode_lit, plain_text),
        ("UNICODE_de", "ucode_de", unicode_lit, plain_text),
        ("UNICODE3", "ucode4", unicode_lit, plain_text),
        ("UNICODE_de3", "ucode_de4", unicode_lit, plain_text),
        ("UTF8", "is_utf8", unicode_lit, plain_text),
        ("UTF8_de", "is_utf8_de", unicode_lit, plain_text),
        ("STRING", "str", string_lit, plain_text),
        ("STRING3", "str1", string_lit, plain_text),
        ("ASCII STRING", "ascii_str", string_lit, plain_text),
        ("ASCII STRING3", "ascii_str1", string_lit, plain_text),
        ("EBCDIC STRING", "ebcdic_str", string_lit, plain_text),
        ("EBCDIC STRING3", "ebcdic_str1", string_lit, plain_text),
        ("BOOLEAN", "bool", bool_lit, True),
        ("DATA", "is_data", data_lit, "0ABC"),
        ("DATA3", "is_data_16", data_lit, "0ABC00"),
        ("VARUNICODE", "varucode", unicode_lit, plain_text),
        ("VARUNICODE_de", "varucode_de", unicode_lit, plain_text),
        ("VARUNICODE3", "varucode4", unicode_lit, plain_text),
        ("VARUNICODE_de3", "varucode_de4", unicode_lit, plain_text),
        ("VARSTRING", "varstr", unicode_lit, plain_text),
        ("VARSTRING3", "varstr3", unicode_lit, plain_text),
        ("QSTRING", "qstr", string_lit, plain_text),
        ("QSTRING3", "qstr8", string_lit, plain_text),
    ]

    for ecl_type, column, literal, *override in type_specs:
        file_name = ("test_get_thor_file_parses_column_types"
                     "_correctly_{}").format(column)
        create_script = (
            "a := DATASET([{{{}}}], {{{} {};}}); "
            "OUTPUT(a,,'~{}');"
        ).format(literal, ecl_type, column, file_name)
        self.conn.run_ecl_string(create_script, True, True, None)

        # An explicit fourth element wins; otherwise expect the literal.
        expected_val = override[0] if override else literal

        actual = get_thor_file(connection=self.conn,
                               thor_file=file_name,
                               dtype=None)
        expected = pd.DataFrame(
            {
                column: [expected_val],
                "__fileposition__": [0]
            },
            index=[0])

        pd.testing.assert_frame_equal(expected.sort_index(axis=1),
                                      actual.sort_index(axis=1),
                                      check_dtype=False)
def test_get_thor_file_uses_generates_chunk_size_325000row_1workers(
        self, mock):
    """Check the first two chunk calls for a 350000-row file, one worker.

    ``mock`` is injected from outside this block; its patch target is not
    visible here (presumably the connection's chunk reader -- confirm).
    No ``chunk_size`` is passed, so the chunk bounds asserted below come
    from ``get_thor_file`` itself.
    """
    mock.return_value = {'int': ['1'], '__fileposition__': ['0']}
    file_name = (
        "test_get_thor_file_uses_generates_"
        "chunk_size_325000row_1workers"
    )

    rows = ",".join(["{1}"] * 350000)
    create_script = (
        "a := DATASET([{}], {{INTEGER int;}}); "
        "OUTPUT(a,,'~{}');"
    ).format(rows, file_name)
    self.conn.run_ecl_string(create_script, True, True, None)

    get_thor_file(connection=self.conn, thor_file=file_name, max_workers=1)

    expected = [
        unittest.mock.call(file_name, 0, 325000, 3, 60),
        unittest.mock.call(file_name, 325000, 25000, 3, 60),
    ]
    # Only the first two recorded calls are compared, position by position.
    for position in (0, 1):
        self.assertEqual(expected[position], mock.call_args_list[position])
def test_get_thor_file_returns_empty_dataset(self):
    """Check that an empty logical file yields an empty, two-column frame."""
    # The leading '~' is part of the name here; the template adds none.
    file_name = '~test_get_thor_file_returns_empty_dataset'
    create_script = (
        "a := DATASET([], {INTEGER int;}); "
        "OUTPUT(a, ,'%s');"
    ) % file_name
    self.conn.run_ecl_string(create_script, True, True, None)

    expected = pd.DataFrame(columns=["int", "__fileposition__"])
    res = get_thor_file(connection=self.conn, thor_file=file_name)

    pd.testing.assert_frame_equal(expected, res)
def test_get_thor_file_returns_1000000_row_dataset(self):
    """Check that a one-million-row file comes back with every row intact."""
    row_count = 1000000
    # The leading '~' is part of the name here; the template adds none.
    file_name = '~test_get_thor_file_returns_1000000_row_dataset'

    inline_rows = "[" + ",".join(["{1}"] * row_count) + "]"
    create_script = (
        "a := DATASET({}, {{INTEGER int;}}); "
        "OUTPUT(a,,'{}');"
    ).format(inline_rows, file_name)
    self.conn.run_ecl_string(create_script, True, True, None)

    res = get_thor_file(connection=self.conn, thor_file=file_name)
    expected = pd.DataFrame({"int": [1] * row_count}, dtype=np.int32)

    # Only the "int" column is compared; other columns are ignored.
    expected_ints = expected.sort_index(axis=1)["int"]
    actual_ints = res.sort_index(axis=1)["int"]
    pd.testing.assert_series_equal(expected_ints, actual_ints)
def test_get_thor_file_returns_a_set(self):
    """Check that a ``SET OF INTEGER`` column is read back as a list."""
    file_name = "test_get_thor_file_returns_a_set"
    create_script = (
        "a := DATASET([{{[1, 2, 3]}}], {{SET OF INTEGER set;}}); "
        "OUTPUT(a,,'~{}');"
    ).format(file_name)
    self.conn.run_ecl_string(create_script, True, True, None)

    res = get_thor_file(self.conn, file_name)

    expected = pd.DataFrame(
        {"set": [[1, 2, 3]], "__fileposition__": 0},
        dtype=np.int32,
    )

    # First check the cell value directly, then the whole frame.
    first_cell = res.set.values[0]
    self.assertEqual(first_cell, [1, 2, 3])
    pd.testing.assert_frame_equal(expected.sort_index(axis=1),
                                  res.sort_index(axis=1))
def test_get_thor_file_uses_single_dtype(self):
    """Check that a single ``dtype`` (``int``) applies to the whole result.

    The ECL column is declared ``STRING`` and is expected back as integers.
    """
    file_name = "test_get_thor_file_uses_single_dtype"
    create_script = (
        "a := DATASET([{{'1'}}, {{'2'}}], {{STRING int;}}); "
        "OUTPUT(a,,'~{}');"
    ).format(file_name)
    self.conn.run_ecl_string(create_script, True, True, None)

    res = get_thor_file(self.conn, file_name, dtype=int)

    expected_columns = {"int": [1, 2], "__fileposition__": [0, 5]}
    expected = pd.DataFrame(expected_columns, dtype=np.int32)

    pd.testing.assert_frame_equal(expected.sort_index(axis=1),
                                  res.sort_index(axis=1))
def test_get_thor_file_returns_single_row_dataset(self):
    """Check that a one-row logical file is returned as a one-row frame."""
    file_name = "test_get_thor_file_returns_single_row_dataset"
    create_script = (
        "a := DATASET([{1}], {INTEGER int;}); "
        "OUTPUT(a,,'~%s');"
    ) % file_name
    self.conn.run_ecl_string(create_script, True, True, None)

    res = get_thor_file(connection=self.conn, thor_file=file_name)

    expected_columns = {"int": [1], "__fileposition__": 0}
    expected = pd.DataFrame(expected_columns, dtype=np.int32)

    pd.testing.assert_frame_equal(expected.sort_index(axis=1),
                                  res.sort_index(axis=1))
def test_get_thor_file_works_when_num_rows_equal_to_chunksize(self):
    """Read a two-row file with ``chunk_size=2`` and check both rows."""
    file_name = "test_get_thor_file_works_when_num_rows_equal_to_chunksize"
    create_script = (
        "a := DATASET([{{1}}, {{2}}], {{INTEGER int;}}); "
        "OUTPUT(a,,'~{}');"
    ).format(file_name)
    self.conn.run_ecl_string(create_script, True, True, None)

    res = get_thor_file(connection=self.conn,
                        thor_file=file_name,
                        chunk_size=2)

    expected_columns = {"int": [1, 2], "__fileposition__": [0, 8]}
    expected = pd.DataFrame(expected_columns, dtype=np.int32)

    pd.testing.assert_frame_equal(expected.sort_index(axis=1),
                                  res.sort_index(axis=1))
def test_get_thor_file_works_when_num_rows_greater_than_chunksize(self):
    """Read a two-row file with ``chunk_size=1`` and check both rows.

    Rows are ordered by ``__fileposition__`` and re-indexed before the
    comparison, and dtypes are not compared.
    """
    file_name = ("test_get_thor_file_works_when_num_rows_greater_than_"
                 "chunksize")
    create_script = (
        "a := DATASET([{{1}}, {{2}}], {{INTEGER int;}}); "
        "OUTPUT(a,,'~{}');"
    ).format(file_name)
    self.conn.run_ecl_string(create_script, True, True, None)

    fetched = get_thor_file(connection=self.conn,
                            thor_file=file_name,
                            chunk_size=1)
    res = (
        fetched
        .sort_index(axis=1)
        .sort_values("__fileposition__")
        .reset_index(drop=True)
    )

    expected_columns = {"int": [1, 2], "__fileposition__": [0, 8]}
    expected = pd.DataFrame(
        expected_columns, dtype=np.int32).sort_index(axis=1)

    pd.testing.assert_frame_equal(expected.sort_index(axis=1),
                                  res.sort_index(axis=1),
                                  check_dtype=False)
def test_spray_file_df(self):
    """Spray a DataFrame directly and check it reads back unchanged."""
    conn = hpycc.Connection("user", test_conn=False)
    thor_file = '~thor::test_spray_file_df'

    # Positional options forwarded to spray_file below.
    overwrite = True
    expire = 1
    chunk_size = 10000
    max_workers = 3
    delete_workunit = True

    column_a = ['1', '3', '5', '6']
    column_b = ['aa', 'ab', 'ac', 'ad']
    frame = pd.DataFrame({"a": column_a, "b": column_b})
    df = frame.sort_values('a')

    spray_file(conn, df, thor_file, overwrite, expire, chunk_size,
               max_workers, delete_workunit)

    fetched = get_thor_file(connection=conn, thor_file=thor_file)
    res = fetched[['a', 'b']]
    pd.testing.assert_frame_equal(df, res)
def test_spray_file_string_smallest_chunks_many_workers(self):
    """Spray seven rows with ``chunk_size=1`` and 100 workers, then compare.

    Both frames are sorted by ``a`` and re-indexed before the comparison.
    """
    conn = hpycc.Connection("user", test_conn=False)
    thor_file = '~test_spray_file_string_smallest_chunks_many_workers'

    # Positional options forwarded to spray_file below.
    overwrite = True
    expire = 1
    chunk_size = 1
    max_workers = 100
    delete_workunit = True

    column_a = ['1', '3', '5', '7', '9', '11', '13']
    column_b = ['a', 'b', 'c', 'd', 'e', 'f', 'g']
    frame = pd.DataFrame({"a": column_a, "b": column_b})
    df = frame.sort_values('a').reset_index(drop=True)

    spray_file(conn, df, thor_file, overwrite, expire, chunk_size,
               max_workers, delete_workunit)

    fetched = get_thor_file(connection=conn, thor_file=thor_file)
    res = fetched[['a', 'b']].sort_values('a').reset_index(drop=True)

    pd.testing.assert_frame_equal(df, res)