Exemplo n.º 1
0
 def test_get_thor_file_uses_custom_max_workers(self, mock):
     """Check that a caller-supplied ``max_workers`` is forwarded.

     ``mock`` is injected by a patch decorator outside this view; the test
     asserts that its most recent call was made with ``max_workers=2``.
     """
     thor_name = "test_get_thor_file_uses_custom_max_workers"
     # Calling the mock hands back a real executor.
     mock.return_value = ThreadPoolExecutor(max_workers=15)
     ecl = ("a := DATASET([{{1}}, {{2}}], {{INTEGER int;}}); "
            "OUTPUT(a,,'~{}');").format(thor_name)
     self.conn.run_ecl_string(ecl, True, True, None)
     get_thor_file(self.conn, thor_name, max_workers=2)
     mock.assert_called_with(max_workers=2)
Exemplo n.º 2
0
 def test_get_thor_file_chunks_when_num_rows_equal_to_chunksize(self, mock):
     """Check the last mocked call when reading one row per chunk.

     A two-row logical file is written and then read with ``chunk_size=1``.
     ``mock`` (injected by a patch decorator outside this view) must have
     been called last with ``(file_name, 1, 1, 3, 60)``.
     """
     thor_name = "test_get_thor_file_chunks_when_num_rows_equal_to_chunksize"
     # Canned payload returned by every call on the mock.
     mock.return_value = {'int': ['1'], '__fileposition__': ['0']}
     ecl = ("a := DATASET([{{1}}, {{2}}], {{INTEGER int;}}); "
            "OUTPUT(a,,'~{}');").format(thor_name)
     self.conn.run_ecl_string(ecl, True, True, None)
     get_thor_file(connection=self.conn, thor_file=thor_name, chunk_size=1)
     mock.assert_called_with(thor_name, 1, 1, 3, 60)
Exemplo n.º 3
0
 def test_get_thor_file_uses_max_sleep(self, mock):
     """Check that a custom ``max_sleep`` shows up in the mocked call.

     ``mock`` is injected by a patch decorator outside this view; its most
     recent call must be ``(file_name, 0, 2, 3, 120)``.
     """
     thor_name = "test_get_thor_file_uses_max_sleep"
     # Canned payload returned by every call on the mock.
     mock.return_value = {"int": [1], "__fileposition__": [0]}
     ecl = ("a := DATASET([{{1}}, {{2}}], {{INTEGER int;}}); "
            "OUTPUT(a,,'~{}');").format(thor_name)
     self.conn.run_ecl_string(ecl, True, False, None)
     get_thor_file(self.conn, thor_name, max_sleep=120)
     mock.assert_called_with(thor_name, 0, 2, 3, 120)
Exemplo n.º 4
0
 def test_get_thor_file_works_when_chunksize_is_zero(self):
     """Reading with ``chunk_size=0`` must raise ``ZeroDivisionError``."""
     thor_name = "test_get_thor_file_works_when_chunksize_is_zero"
     ecl = ("a := DATASET([{{1}}, {{2}}], {{INTEGER int;}}); "
            "OUTPUT(a,,'~{}');").format(thor_name)
     self.conn.run_ecl_string(ecl, True, True, None)
     read_kwargs = {
         "connection": self.conn,
         "thor_file": thor_name,
         "chunk_size": 0,
     }
     with self.assertRaises(ZeroDivisionError):
         get_thor_file(**read_kwargs)
Exemplo n.º 5
0
    def test_get_thor_file_chunks_when_num_rows_above_chunksize(self, mock):
        """Check every mocked call when two rows are read one per chunk.

        ``mock`` is injected by a patch decorator outside this view. Its
        full call list must be exactly two calls, starting at rows 0 and 1,
        each with the trailing arguments ``1, 3, 60``.
        """
        thor_name = ("test_get_thor_file_chunks_when_"
                     "num_rows_greater_than_chunksize")
        # Canned payload returned by every call on the mock.
        mock.return_value = {'int': ['1'], '__fileposition__': ['0']}
        ecl = ("a := DATASET([{{1}}, {{2}}], {{INTEGER int;}}); "
               "OUTPUT(a,,'~{}');").format(thor_name)
        self.conn.run_ecl_string(ecl, True, True, None)
        get_thor_file(connection=self.conn, thor_file=thor_name, chunk_size=1)

        wanted_calls = [
            unittest.mock.call(thor_name, start_row, 1, 3, 60)
            for start_row in (0, 1)
        ]
        self.assertEqual(wanted_calls, mock.call_args_list)
Exemplo n.º 6
0
 def test_get_thor_file_uses_dict_of_dtypes_with_extra_cols(self):
     """A dtype mapping naming an unknown column must raise ``KeyError``."""
     thor_name = "test_get_thor_file_uses_dict_of_dtypes_with_extra_cols"
     ecl = ("a := DATASET([{{'1', TRUE, 1}}, {{'2', FALSE, 2}}], "
            "{{STRING str; BOOLEAN bool; INTEGER int;}}); "
            "OUTPUT(a,,'~{}');").format(thor_name)
     self.conn.run_ecl_string(ecl, True, True, None)
     # "made_up" is not one of the columns declared in the ECL above.
     requested_types = {"bool": bool, "int": str, "made_up": str}
     with self.assertRaises(KeyError):
         get_thor_file(self.conn, thor_name, dtype=requested_types)
Exemplo n.º 7
0
    def test_spray_file_string(self):
        """Spray a CSV file passed as a path and read it back unchanged."""
        conn = hpycc.Connection("user", test_conn=False)
        thor_file = '~thor::test_spray_file_string'
        overwrite, expire, delete_workunit = True, 1, True
        chunk_size, max_workers = 10000, 3

        frame = pd.DataFrame({
            "a": ['1', '3', '5', '6'],
            "b": ['aa', 'ab', 'ac', 'ad'],
        })
        frame = frame.sort_values('a')

        # The CSV only needs to exist while it is being sprayed.
        with TemporaryDirectory() as tmp_dir:
            csv_path = os.path.join(tmp_dir, "test.csv")
            frame.to_csv(csv_path, index=False)
            spray_file(conn, csv_path, thor_file, overwrite, expire,
                       chunk_size, max_workers, delete_workunit)

        fetched = get_thor_file(connection=conn, thor_file=thor_file)
        pd.testing.assert_frame_equal(frame, fetched[['a', 'b']])
Exemplo n.º 8
0
 def test_get_thor_file_uses_dict_of_dtypes(self):
     """A per-column dtype mapping must be applied to the returned frame."""
     thor_name = "test_get_thor_file_uses_dict_of_dtypes"
     column_types = {
         "str": int,
         "bool": bool,
         "int": str,
         "__fileposition__": str,
     }
     ecl = ("a := DATASET([{{'1', TRUE, 1}}, {{'2', FALSE, 2}}], "
            "{{STRING str; BOOLEAN bool; INTEGER int;}}); "
            "OUTPUT(a,,'~{}');").format(thor_name)
     self.conn.run_ecl_string(ecl, True, True, None)
     # Hand over a copy so the mapping used for the expectation stays
     # independent of whatever the callee does with its argument.
     actual = get_thor_file(self.conn, thor_name, dtype=dict(column_types))

     untyped = pd.DataFrame({
         "int": ["1", "2"],
         "str": [1, 2],
         "bool": [True, False],
         "__fileposition__": ["0", "14"],
     })
     wanted = untyped.astype(column_types)
     # Column order is not part of the check; compare in sorted order.
     pd.testing.assert_frame_equal(wanted.sort_index(axis=1),
                                   actual.sort_index(axis=1))
Exemplo n.º 9
0
    def test_concatenate_logical_files_concatenates_one_file(self):
        """Concatenating a single logical file yields that file's contents.

        One logical file is created via ``send_file_chunks``, passed alone to
        ``_concatenate_logical_files``, and the result is read back and
        compared with the values that were written.
        """
        thor_file = '~thor::test_concatenate_logical_files_concatenates_one_file'
        overwrite = True
        expire = 1
        delete_workunit = True
        conn = hpycc.Connection("user", test_conn=False)

        output_names = ['~a2']
        col_1_values = ['1']
        col_2_values = ['aa']

        # A plain loop: the calls are made only for their side effect, so
        # there is no reason to collect their results in a throwaway list.
        for col1, col2, nam in zip(col_1_values, col_2_values, output_names):
            send_file_chunks(conn, CONCAT_SCRIPT_BASE % (col1, col2, nam))

        _concatenate_logical_files(conn, output_names, thor_file,
                                   'STRING a; STRING b;', overwrite, expire,
                                   delete_workunit)

        res = get_thor_file(connection=conn,
                            thor_file=thor_file)[['a', 'b']].sort_values("a")
        expected_result = pd.DataFrame({
            "a": col_1_values,
            "b": col_2_values
        }).sort_values("a")

        pd.testing.assert_frame_equal(expected_result, res)
Exemplo n.º 10
0
    def test_concatenate_logical_files_concatenates_files(self):
        """Concatenating four logical files yields all of their rows.

        Four single-row logical files are created via ``send_file_chunks``,
        concatenated with ``_concatenate_logical_files``, and the result is
        read back and compared (ignoring row order and index) with the
        values that were written.
        """
        thor_file = '~thor::testsprayconcatenatelogicalfiles'
        overwrite = True
        expire = 1
        delete_workunit = True

        output_names = ['~a1', '~b1', '~c1', '~x1']
        col_1_values = ['1', '3', '5', '6']
        col_2_values = ['aa', 'ab', 'ac', 'ad']

        # A plain loop: the calls are made only for their side effect, so
        # there is no reason to collect their results in a throwaway list.
        for col1, col2, nam in zip(col_1_values, col_2_values, output_names):
            send_file_chunks(self.conn,
                             CONCAT_SCRIPT_BASE % (col1, col2, nam))

        _concatenate_logical_files(self.conn, output_names, thor_file,
                                   'STRING a; STRING b;', overwrite, expire,
                                   delete_workunit)

        res = get_thor_file(connection=self.conn,
                            thor_file=thor_file)[['a', 'b']].sort_values("a")
        expected_result = pd.DataFrame({
            "a": col_1_values,
            "b": col_2_values
        }).sort_values("a")

        # Both sides are sorted by "a" and re-indexed so that neither the
        # original row order nor the index labels affect the comparison.
        pd.testing.assert_frame_equal(expected_result.reset_index(drop=True),
                                      res.reset_index(drop=True),
                                      check_names=False)
Exemplo n.º 11
0
    def test_get_thor_file_parses_column_types_correctly(self):
        """Round-trip one value per ECL column type through get_thor_file.

        Each entry of ``cases`` is ``(ecl_type, column_name, ecl_literal)``
        with an optional fourth element giving the value expected back.
        When the fourth element is absent, the literal itself is expected.
        """
        int_val = 1
        dec_val = 1.5
        uni_lit = "U'ABC'"
        str_lit = "'ABC'"
        bool_lit = "TRUE"
        hex_lit = "x'ABC'"
        plain = "ABC"
        cases = [
            ("INTEGER", "int", int_val),
            ("INTEGER1", "int1", int_val),
            ("UNSIGNED INTEGER", "unsigned_int", int_val),
            ("UNSIGNED INTEGER1", "unsigned_int_1", int_val),
            ("UNSIGNED8", "is_unsigned_8", int_val),
            ("UNSIGNED", "usigned", int_val),
            ("DECIMAL10", "dec10", dec_val, round(dec_val)),
            ("DECIMAL5_3", "dec5_3", dec_val),
            ("UNSIGNED DECIMAL10", "unsigned_dec10", dec_val, round(dec_val)),
            ("UNSIGNED DECIMAL5_3", "unsigned_decl5_3", dec_val),
            ("UDECIMAL10", "udec10", dec_val, round(dec_val)),
            ("UDECIMAL5_3", "udec5_3", dec_val),
            ("REAL", "is_real", dec_val),
            ("REAL4", "is_real4", dec_val),
            ("UNICODE", "ucode", uni_lit, plain),
            ("UNICODE_de", "ucode_de", uni_lit, plain),
            ("UNICODE3", "ucode4", uni_lit, plain),
            ("UNICODE_de3", "ucode_de4", uni_lit, plain),
            ("UTF8", "is_utf8", uni_lit, plain),
            ("UTF8_de", "is_utf8_de", uni_lit, plain),
            ("STRING", "str", str_lit, plain),
            ("STRING3", "str1", str_lit, plain),
            ("ASCII STRING", "ascii_str", str_lit, plain),
            ("ASCII STRING3", "ascii_str1", str_lit, plain),
            ("EBCDIC STRING", "ebcdic_str", str_lit, plain),
            ("EBCDIC STRING3", "ebcdic_str1", str_lit, plain),
            ("BOOLEAN", "bool", bool_lit, True),
            ("DATA", "is_data", hex_lit, "0ABC"),
            ("DATA3", "is_data_16", hex_lit, "0ABC00"),
            ("VARUNICODE", "varucode", uni_lit, plain),
            ("VARUNICODE_de", "varucode_de", uni_lit, plain),
            ("VARUNICODE3", "varucode4", uni_lit, plain),
            ("VARUNICODE_de3", "varucode_de4", uni_lit, plain),
            ("VARSTRING", "varstr", uni_lit, plain),
            ("VARSTRING3", "varstr3", uni_lit, plain),
            ("QSTRING", "qstr", str_lit, plain),
            ("QSTRING3", "qstr8", str_lit, plain),
        ]
        ecl_template = ("a := DATASET([{{{}}}], {{{} {};}}); "
                        "OUTPUT(a,,'~{}');")
        for ecl_type, column, literal, *override in cases:
            thor_name = ("test_get_thor_file_parses_column_types"
                         "_correctly_{}").format(column)
            self.conn.run_ecl_string(
                ecl_template.format(literal, ecl_type, column, thor_name),
                True, True, None)
            # An explicit fourth element wins over the literal written.
            wanted_value = override[0] if override else literal
            actual = get_thor_file(connection=self.conn,
                                   thor_file=thor_name,
                                   dtype=None)
            wanted = pd.DataFrame(
                {
                    column: [wanted_value],
                    "__fileposition__": [0]
                }, index=[0])

            pd.testing.assert_frame_equal(wanted.sort_index(axis=1),
                                          actual.sort_index(axis=1),
                                          check_dtype=False)
Exemplo n.º 12
0
    def test_get_thor_file_uses_generates_chunk_size_325000row_1workers(
            self, mock):
        """Check the first two mocked calls for a 350000-row file, 1 worker.

        ``mock`` is injected by a patch decorator outside this view. The
        first call must cover rows ``0..325000`` and the second the
        remaining ``25000`` rows.
        """
        thor_name = ("test_get_thor_file_uses_generates_"
                     "chunk_size_325000row_1workers")
        # Canned payload returned by every call on the mock.
        mock.return_value = {'int': ['1'], '__fileposition__': ['0']}

        inline_rows = ",".join(["{1}"] * 350000)
        ecl = ("a := DATASET([{}], {{INTEGER int;}}); "
               "OUTPUT(a,,'~{}');").format(inline_rows, thor_name)
        self.conn.run_ecl_string(ecl, True, True, None)
        get_thor_file(connection=self.conn, thor_file=thor_name, max_workers=1)

        first_call = unittest.mock.call(thor_name, 0, 325000, 3, 60)
        second_call = unittest.mock.call(thor_name, 325000, 25000, 3, 60)
        self.assertEqual(first_call, mock.call_args_list[0])
        self.assertEqual(second_call, mock.call_args_list[1])
Exemplo n.º 13
0
    def test_get_thor_file_returns_empty_dataset(self):
        """An empty logical file must come back as an empty frame.

        The expected frame has the columns ``int`` and ``__fileposition__``
        and no rows.
        """
        thor_name = '~test_get_thor_file_returns_empty_dataset'
        ecl = ("a := DATASET([], {INTEGER int;}); "
               "OUTPUT(a, ,'%s');") % thor_name
        self.conn.run_ecl_string(ecl, True, True, None)
        actual = get_thor_file(connection=self.conn, thor_file=thor_name)
        wanted = pd.DataFrame(columns=["int", "__fileposition__"])

        pd.testing.assert_frame_equal(wanted, actual)
Exemplo n.º 14
0
 def test_get_thor_file_returns_1000000_row_dataset(self):
     """A one-million-row file must come back with every ``int`` value."""
     thor_name = '~test_get_thor_file_returns_1000000_row_dataset'
     inline_rows = "[" + ",".join(["{1}"] * 1000000) + "]"
     ecl = ("a := DATASET({}, {{INTEGER int;}}); "
            "OUTPUT(a,,'{}');").format(inline_rows, thor_name)
     self.conn.run_ecl_string(ecl, True, True, None)
     actual = get_thor_file(connection=self.conn, thor_file=thor_name)
     wanted = pd.DataFrame({"int": [1] * 1000000}, dtype=np.int32)
     # Only the "int" column is compared.
     wanted_ints = wanted.sort_index(axis=1)["int"]
     actual_ints = actual.sort_index(axis=1)["int"]
     pd.testing.assert_series_equal(wanted_ints, actual_ints)
Exemplo n.º 15
0
 def test_get_thor_file_returns_a_set(self):
     """A ``SET OF INTEGER`` column must come back as a Python list."""
     thor_name = "test_get_thor_file_returns_a_set"
     ecl = ("a := DATASET([{{[1, 2, 3]}}], {{SET OF INTEGER set;}}); "
            "OUTPUT(a,,'~{}');").format(thor_name)
     self.conn.run_ecl_string(ecl, True, True, None)
     actual = get_thor_file(self.conn, thor_name)
     wanted = pd.DataFrame(
         {
             "set": [[1, 2, 3]],
             "__fileposition__": 0
         },
         dtype=np.int32)
     # Check the raw cell value first, then the frame as a whole.
     first_cell = actual.set.values[0]
     self.assertEqual(first_cell, [1, 2, 3])
     pd.testing.assert_frame_equal(wanted.sort_index(axis=1),
                                   actual.sort_index(axis=1))
Exemplo n.º 16
0
    def test_get_thor_file_uses_single_dtype(self):
        """A single ``dtype`` must be applied to every returned column.

        The file stores the strings '1' and '2'; with ``dtype=int`` the
        expected frame holds integers in both columns.
        """
        thor_name = "test_get_thor_file_uses_single_dtype"
        ecl = ("a := DATASET([{{'1'}}, {{'2'}}], {{STRING int;}}); "
               "OUTPUT(a,,'~{}');").format(thor_name)
        self.conn.run_ecl_string(ecl, True, True, None)
        actual = get_thor_file(self.conn, thor_name, dtype=int)
        wanted = pd.DataFrame(
            {
                "int": [1, 2],
                "__fileposition__": [0, 5]
            },
            dtype=np.int32)

        pd.testing.assert_frame_equal(wanted.sort_index(axis=1),
                                      actual.sort_index(axis=1))
Exemplo n.º 17
0
 def test_get_thor_file_returns_single_row_dataset(self):
     """A one-row logical file must come back as a one-row frame."""
     file_name = "test_get_thor_file_returns_single_row_dataset"
     self.conn.run_ecl_string(
         "a := DATASET([{1}], {INTEGER int;}); "
         "OUTPUT(a,,'~%s');" % file_name, True, True, None)
     # Reuse file_name rather than repeating the literal, so the file that
     # is written and the file that is read cannot drift apart.
     res = get_thor_file(connection=self.conn, thor_file=file_name)
     expected = pd.DataFrame(
         {
             "int": [1],
             "__fileposition__": 0
         },
         dtype=np.int32)
     # Column order is not part of the check; compare in sorted order.
     pd.testing.assert_frame_equal(expected.sort_index(axis=1),
                                   res.sort_index(axis=1))
Exemplo n.º 18
0
 def test_get_thor_file_works_when_num_rows_equal_to_chunksize(self):
     """Reading a two-row file with ``chunk_size=2`` returns both rows."""
     thor_name = "test_get_thor_file_works_when_num_rows_equal_to_chunksize"
     ecl = ("a := DATASET([{{1}}, {{2}}], {{INTEGER int;}}); "
            "OUTPUT(a,,'~{}');").format(thor_name)
     self.conn.run_ecl_string(ecl, True, True, None)
     actual = get_thor_file(connection=self.conn,
                            thor_file=thor_name,
                            chunk_size=2)
     wanted = pd.DataFrame(
         {
             "int": [1, 2],
             "__fileposition__": [0, 8]
         },
         dtype=np.int32)
     # Column order is not part of the check; compare in sorted order.
     pd.testing.assert_frame_equal(wanted.sort_index(axis=1),
                                   actual.sort_index(axis=1))
Exemplo n.º 19
0
 def test_get_thor_file_works_when_num_rows_greater_than_chunksize(self):
     """Reading a two-row file with ``chunk_size=1`` returns both rows.

     Rows are ordered by ``__fileposition__`` before the comparison, and
     dtypes are not checked.
     """
     file_name = ("test_get_thor_file_works_when_num_rows_greater_than_"
                  "chunksize")
     self.conn.run_ecl_string(
         "a := DATASET([{{1}}, {{2}}], {{INTEGER int;}}); "
         "OUTPUT(a,,'~{}');".format(file_name), True, True, None)
     res = get_thor_file(connection=self.conn,
                         thor_file=file_name,
                         chunk_size=1)
     # Normalise once: order rows by file position, drop the old index and
     # sort the columns. Presumably the row order across chunks is not
     # guaranteed -- confirm against get_thor_file.
     res = (res.sort_values("__fileposition__")
            .reset_index(drop=True)
            .sort_index(axis=1))
     expected = pd.DataFrame(
         {
             "int": [1, 2],
             "__fileposition__": [0, 8]
         },
         dtype=np.int32).sort_index(axis=1)
     pd.testing.assert_frame_equal(expected, res, check_dtype=False)
Exemplo n.º 20
0
    def test_spray_file_df(self):
        """Spray an in-memory DataFrame and read it back unchanged."""
        conn = hpycc.Connection("user", test_conn=False)
        thor_file = '~thor::test_spray_file_df'
        overwrite, expire, delete_workunit = True, 1, True
        chunk_size, max_workers = 10000, 3

        frame = pd.DataFrame({
            "a": ['1', '3', '5', '6'],
            "b": ['aa', 'ab', 'ac', 'ad'],
        })
        frame = frame.sort_values('a')

        spray_file(conn, frame, thor_file, overwrite, expire, chunk_size,
                   max_workers, delete_workunit)

        fetched = get_thor_file(connection=conn, thor_file=thor_file)
        pd.testing.assert_frame_equal(frame, fetched[['a', 'b']])
Exemplo n.º 21
0
    def test_spray_file_string_smallest_chunks_many_workers(self):
        """Spray seven rows with ``chunk_size=1`` and 100 workers.

        Both the source frame and the result are sorted by "a" and
        re-indexed before being compared.
        """
        conn = hpycc.Connection("user", test_conn=False)
        thor_file = '~test_spray_file_string_smallest_chunks_many_workers'
        overwrite, expire, delete_workunit = True, 1, True
        chunk_size, max_workers = 1, 100

        frame = pd.DataFrame({
            "a": ['1', '3', '5', '7', '9', '11', '13'],
            "b": ['a', 'b', 'c', 'd', 'e', 'f', 'g'],
        })
        frame = frame.sort_values('a').reset_index(drop=True)

        spray_file(conn, frame, thor_file, overwrite, expire, chunk_size,
                   max_workers, delete_workunit)

        fetched = get_thor_file(connection=conn, thor_file=thor_file)
        fetched = fetched[['a', 'b']].sort_values('a').reset_index(drop=True)

        pd.testing.assert_frame_equal(frame, fetched)