Пример #1
0
 def test_tsv(self):
     """Tab-separated input parses into two text columns with no errors."""
     rendered = render_arrow(P(csv="A\tB\na\tb\nc\td"))
     expected = {"A": ["a", "c"], "B": ["b", "d"]}
     assert_arrow_table_equals(rendered.table, expected)
     self.assertEqual(rendered.errors, [])
Пример #2
0
    def test_fetch_integration(self, send_update, queue_render):
        """End-to-end fetch: the module's fetch() output is stored as Parquet,
        a re-render is queued and clients are notified.

        NOTE(review): send_update/queue_render look like mocks injected by
        decorators outside this view (presumably @patch) — confirm.
        """
        # The patched functions are awaited, so they must return awaitables.
        queue_render.side_effect = async_value(None)
        send_update.side_effect = async_value(None)
        workflow = Workflow.create_and_init()
        # Module whose fetch() returns a one-row DataFrame; render() is identity.
        create_module_zipfile(
            "mod",
            python_code=
            ("import pandas as pd\ndef fetch(params): return pd.DataFrame({'A': [1]})\ndef render(table, params): return table"
             ),
        )
        wf_module = workflow.tabs.first().wf_modules.create(
            order=0, slug="step-1", module_id_name="mod")
        cjwstate.modules.init_module_system()
        now = timezone.now()
        # fetch() logs at INFO; assertLogs both requires and captures that.
        with self.assertLogs(level=logging.INFO):
            self.run_with_async_db(
                fetch.fetch(workflow_id=workflow.id,
                            wf_module_id=wf_module.id,
                            now=now))
        wf_module.refresh_from_db()
        # The fetch result must now exist as the current StoredObject version.
        so = wf_module.stored_objects.get(
            stored_at=wf_module.stored_data_version)
        # Stored data is Parquet; read it back and check the fetched table.
        with minio.temporarily_download(minio.StoredObjectsBucket,
                                        so.key) as parquet_path:
            table = pyarrow.parquet.read_table(str(parquet_path),
                                               use_threads=False)
            assert_arrow_table_equals(table, {"A": [1]})

        workflow.refresh_from_db()
        # A successful fetch schedules a render at the latest delta and
        # pushes an update to connected clients.
        queue_render.assert_called_with(workflow.id, workflow.last_delta_id)
        send_update.assert_called()
Пример #3
0
 def test_no_header(self):
     """Without a header row, the single data row keeps its values and
     columns receive automatic positional names."""
     rendered = render_arrow(P(csv="A,B", has_header_row=False))
     assert_arrow_table_equals(
         rendered.table, {"Column 1": ["A"], "Column 2": ["B"]})
     self.assertEqual(rendered.errors, [])
Пример #4
0
 def test_render_deprecated_parquet(self):
     """A deprecated Parquet-format fetch result becomes the output table."""
     with parquet_file({"A": [1, 2], "B": [3, 4]}) as fetched_path:
         fetch_result = FetchResult(fetched_path)
         result = render_arrow(
             ArrowTable(), P(), "tab-x", fetch_result, self.output_path)
     assert_arrow_table_equals(result.table, {"A": [1, 2], "B": [3, 4]})
     self.assertEqual(result.errors, [])
Пример #5
0
 def test_render_error(self):
     """Parsing a non-JSON payload as .json yields an empty table and a
     single parse-error RenderError."""
     path = self._file(b"A,B\nx,y", suffix=".json")
     params = {"file": path, "has_header": True}
     result = upload.render_arrow(
         ArrowTable(), params, "tab-x", None, self.output_path)
     assert_arrow_table_equals(result.table, {})
     expected_error = RenderError(
         message=I18nMessage(
             id="TODO_i18n",
             args={"text": "JSON parse error at byte 0: Invalid value."},
         ),
         quick_fixes=[],
     )
     self.assertEqual(result.errors, [expected_error])
Пример #6
0
 def test_render_fetch_error(self):
     """Errors attached to the fetch result propagate unchanged into the
     render result; the table stays empty."""
     fetch_errors = [RenderError(I18nMessage("x", {"y": "z"}))]
     with tempfile_context() as empty_path:
         fetch_result = FetchResult(empty_path, fetch_errors)
         with self.render(P(), fetch_result) as result:
             assert_arrow_table_equals(result.table, ArrowTable())
             self.assertEqual(result.errors, fetch_errors)
Пример #7
0
 def test_detect_semicolon_csv_by_suffix(self):
     """A .txt file containing semicolons is sniffed as semicolon-delimited."""
     with _data_file(b"A;B\nx;y\nz;a", suffix=".txt") as txt_path:
         parsed = parse_file(txt_path, output_path=self.output_path)
     expected = {"A": ["x", "z"], "B": ["y", "a"]}
     assert_arrow_table_equals(parsed.table, expected)
Пример #8
0
 def test_detect_tsv_by_suffix(self):
     """A .tsv suffix selects the tab-separated parser."""
     with _data_file(b"A\tB\nx\ty\nz\ta", suffix=".tsv") as tsv_path:
         parsed = parse_file(tsv_path, output_path=self.output_path)
     expected = {"A": ["x", "z"], "B": ["y", "a"]}
     assert_arrow_table_equals(parsed.table, expected)
Пример #9
0
 def test_detect_csv_by_suffix(self):
     """A .csv suffix selects the comma-separated parser."""
     with _data_file(b"A,B\nx,y\nz,a", suffix=".csv") as csv_path:
         parsed = parse_file(csv_path, output_path=self.output_path)
     expected = {"A": ["x", "z"], "B": ["y", "a"]}
     assert_arrow_table_equals(parsed.table, expected)
Пример #10
0
 def test_mime_type_overrides_suffix(self):
     """An explicit MIME type wins over the file-name suffix."""
     # File is ".csv" but we parse as ".json" because mime_type=MimeType.JSON
     with _data_file(b'[{"X":"x"}]', suffix=".csv") as json_path:
         parsed = parse_file(
             json_path, output_path=self.output_path, mime_type=MimeType.JSON)
     assert_arrow_table_equals(parsed.table, {"X": ["x"]})
Пример #11
0
 def test_detect_xlsx_by_suffix(self):
     """A .xlsx suffix selects the Excel-workbook parser."""
     parsed = parse_file(
         TestDataPath / "test.xlsx", output_path=self.output_path)
     expected = {"Month": ["Jan", "Feb"], "Amount": [10, 20]}
     assert_arrow_table_equals(parsed.table, expected)
Пример #12
0
    def test_fetch_integration(self, send_update, queue_render):
        """End-to-end fetch with module code stored in minio: the fetch()
        output is persisted as Parquet, a render is queued and an update sent.

        NOTE(review): send_update/queue_render look like mocks injected by
        decorators outside this view (presumably @patch) — confirm.
        """
        # The patched functions are awaited, so they must return awaitables.
        queue_render.side_effect = async_value(None)
        send_update.side_effect = async_value(None)
        workflow = Workflow.create_and_init()
        # Register a minimal module spec; its code is uploaded separately below.
        ModuleVersion.create_or_replace_from_spec(
            {"id_name": "mod", "name": "Mod", "category": "Clean", "parameters": []},
            source_version_hash="abc123",
        )
        wf_module = workflow.tabs.first().wf_modules.create(
            order=0, slug="step-1", module_id_name="mod"
        )
        # Module code lives in minio under <id_name>/<version_hash>/code.py;
        # fetch() returns a one-row DataFrame, render() is identity.
        minio.put_bytes(
            minio.ExternalModulesBucket,
            "mod/abc123/code.py",
            b"import pandas as pd\ndef fetch(params): return pd.DataFrame({'A': [1]})\ndef render(table, params): return table",
        )
        cjwstate.modules.init_module_system()
        now = timezone.now()
        # fetch() logs at INFO; assertLogs both requires and captures that.
        with self.assertLogs(level=logging.INFO):
            self.run_with_async_db(
                fetch.fetch(workflow_id=workflow.id, wf_module_id=wf_module.id, now=now)
            )
        wf_module.refresh_from_db()
        # The fetch result must now exist as the current StoredObject version.
        so = wf_module.stored_objects.get(stored_at=wf_module.stored_data_version)
        # Stored data is Parquet; read it back and check the fetched table.
        with minio.temporarily_download(so.bucket, so.key) as parquet_path:
            table = pyarrow.parquet.read_table(str(parquet_path), use_threads=False)
            assert_arrow_table_equals(table, {"A": [1]})

        workflow.refresh_from_db()
        # A successful fetch schedules a render at the latest delta and
        # pushes an update to connected clients.
        queue_render.assert_called_with(workflow.id, workflow.last_delta_id)
        send_update.assert_called()
Пример #13
0
 def render(*args, fetch_result, **kwargs):
     """Stub render(): assert the fetch result handed to it, return empty.

     Closure over the enclosing test's ``self``.
     """
     expected_errors = [RenderError(I18nMessage.TODO_i18n("maybe an error"))]
     self.assertEqual(fetch_result.errors, expected_errors)
     stored = pyarrow.parquet.read_table(str(fetch_result.path))
     assert_arrow_table_equals(stored, {"A": [1]})
     return RenderResult()
Пример #14
0
 def test_no_nan(self):
     """The literal string "NA" must survive as text, not become NaN."""
     # https://www.pivotaltracker.com/story/show/163106728
     rendered = render_arrow(P(csv="A,B\nx,y\nz,NA"))
     expected = {"A": ["x", "z"], "B": ["y", "NA"]}
     assert_arrow_table_equals(rendered.table, expected)
     self.assertEqual(rendered.errors, [])
Пример #15
0
 def test_render_deprecated_parquet(self):
     """A deprecated Parquet fetch result round-trips into the output table."""
     fetched = {"A": [1, 2], "B": [3, 4]}
     with parquet_file(fetched) as fetched_path:
         with self.render(P(), FetchResult(fetched_path)) as result:
             assert_arrow_table_equals(result.table, fetched)
             self.assertEqual(result.errors, [])
Пример #16
0
 def test_empty_column_name_gets_automatic_name(self):
     """An empty header cell is replaced by its positional default name."""
     rendered = render_arrow(P(csv="A,,B\na,b,c", has_header_row=True))
     assert_arrow_table_equals(
         rendered.table, {"A": ["a"], "Column 2": ["b"], "B": ["c"]})
     self.assertEqual(rendered.errors, [])
Пример #17
0
 def test_read_issue_375_snappy(self):
     """Snappy-compressed fixture from fastparquet issue 375 reads correctly."""
     actual = parquet.read(self._testPath("fastparquet-issue-375-snappy.par"))
     expected = {
         "A": ["A" * 32760] * 10,
         "__index_level_0__": list(range(10)),
     }
     assert_arrow_table_equals(actual, expected)
Пример #18
0
 def test_json_detect_encoding_by_default(self):
     """With encoding=None the parser autodetects the file's encoding."""
     data = '[{"A":"café"}]'.encode("windows-1252")
     with _data_file(data) as path:
         parsed = parse_file(
             path,
             output_path=self.output_path,
             mime_type=MimeType.JSON,
             encoding=None,
         )
     assert_arrow_table_equals(parsed.table, {"A": ["café"]})
Пример #19
0
 def test_dataframe_all_null_text_column(self):
     """An all-null str column converts to an all-null Arrow string column."""
     actual = dataframe_to_arrow_table(
         pd.DataFrame({"A": [None]}, dtype=str),
         [Column("A", ColumnType.TEXT())],
         self.path,
     )
     expected = arrow_table({"A": pyarrow.array([None], pyarrow.string())})
     assert_arrow_table_equals(actual, expected)
Пример #20
0
    def test_render_arrow_table(self):
        """A render() declaring "arrow_table" writes its own Arrow output file."""
        # The param name "arrow_table" is a special case
        def render(arrow_table, params, output_path, **kwargs):
            replacement = pa.table({"A": [2]})
            writer = pa.ipc.RecordBatchFileWriter(output_path, replacement.schema)
            with writer:
                writer.write_table(replacement)

        result = self._test_render(render, {"A": [1]})
        assert_arrow_table_equals(result.table, {"A": [2]})
Пример #21
0
 def test_xlsx_nix_control_characters_from_colnames(self):
     """Control characters are stripped from xlsx header names."""
     path = TestDataPath / "headers-have-control-characters.xlsx"
     with tempfile_context(suffix=".arrow") as output_path:
         parsed = parse_xlsx_file(
             path,
             output_path=output_path,
             has_header=True,
             autoconvert_types=False,
         )
     assert_arrow_table_equals(parsed.table, {"AB": ["a"], "C": ["b"]})
     self.assertEqual(parsed.errors, [])
Пример #22
0
 def test_xlsx_cast_colnames_to_str(self):
     """Numeric xlsx header cells become string column names."""
     path = TestDataPath / "all-numeric.xlsx"
     with tempfile_context(suffix=".arrow") as output_path:
         parsed = parse_xlsx_file(
             path,
             output_path=output_path,
             has_header=True,
             autoconvert_types=True,
         )
     assert_arrow_table_equals(parsed.table, {"1": [2]})
     self.assertEqual(parsed.errors, [])
Пример #23
0
 def test_render_deprecated_parquet_warning(self):
     """Warnings stored with a deprecated Parquet fetch are surfaced."""
     errors = [RenderError(I18nMessage.TODO_i18n("truncated table"))]
     with parquet_file({"A": [1, 2], "B": [3, 4]}) as fetched_path:
         fetch_result = FetchResult(fetched_path, errors)
         with self.render(P(), fetch_result) as result:
             assert_arrow_table_equals(
                 result.table, {"A": [1, 2], "B": [3, 4]})
             self.assertEqual(result.errors, errors)
Пример #24
0
 def _test_read_write_table(self, table, expected=None):
     """Write ``table`` to Parquet, read it back, and compare the result.

     Args:
         table: input table spec (anything ``arrow_table`` accepts).
         expected: table the round-trip should produce. Defaults to
             ``table`` itself for lossless round-trips; pass a different
             value when the write/read cycle is expected to transform data.
     """
     table = arrow_table(table).table
     if expected is None:
         expected = table
     else:
         expected = arrow_table(expected).table
     parquet.write(self.temp_path, table)
     result = parquet.read(self.temp_path)
     # Bug fix: compare against `expected` — the original compared against
     # `table`, silently ignoring the `expected` argument.
     assert_arrow_table_equals(result, expected)
Пример #25
0
 def test_xls(self):
     """A legacy .xls workbook parses with type autoconversion enabled."""
     path = TestDataPath / "example.xls"
     with tempfile_context(suffix=".arrow") as output_path:
         parsed = parse_xls_file(
             path,
             output_path=output_path,
             has_header=True,
             autoconvert_types=True,
         )
     assert_arrow_table_equals(parsed.table, {"foo": [1, 2], "bar": [2, 3]})
     self.assertEqual(parsed.errors, [])
Пример #26
0
 def test_csv_has_header_false(self):
     """With has_header=False every row is data and columns are auto-named."""
     with _data_file(b"A\n1.00\n2") as path:
         parsed = parse_file(
             path,
             output_path=self.output_path,
             mime_type=MimeType.CSV,
             has_header=False,
         )
     assert_arrow_table_equals(
         parsed.table, {"Column 1": ["A", "1.00", "2"]})
Пример #27
0
 def test_json_override_encoding_by_argument(self):
     """An explicit ``encoding`` argument beats encoding autodetection."""
     # caller-selected encoding overrides autodetected encoding
     with _data_file('[{"A":"café"}]'.encode("utf-8")) as path:
         result = parse_file(
             path,
             output_path=self.output_path,
             mime_type=MimeType.JSON,
             encoding="windows-1252",
         )
     # Bug fix: the UTF-8 bytes of "é" (0xC3 0xA9) decoded as windows-1252
     # become "Ã©". Expecting "café" would only pass if the override were
     # IGNORED (autodetect picking utf-8) — the opposite of what this test
     # is meant to prove.
     assert_arrow_table_equals(result.table, {"A": ["cafÃ©"]})
Пример #28
0
    def test_render_truncate(self):
        """Output longer than the harness row limit is truncated with a warning."""
        def render(table, params):
            # Three rows; the harness limit is two.
            return pd.DataFrame({"A": [1, 2, 3]})

        result = self._test_render(render)
        assert_arrow_table_equals(result.table, {"A": [1, 2]})
        expected_errors = [
            RenderError(I18nMessage.TODO_i18n("Truncated output from 3 rows to 2"))
        ]
        self.assertEqual(result.errors, expected_errors)
Пример #29
0
 def test_read_issue_375_uncompressed(self):
     """Uncompressed large-dictionary fixture from fastparquet issue 375 reads."""
     # https://github.com/dask/fastparquet/issues/375
     # large dictionary written by pyarrow.parquet.
     actual = parquet.read(self._testPath("fastparquet-issue-375.par"))
     expected = {
         "A": ["A" * 32755] * 10,
         "__index_level_0__": list(range(10)),
     }
     assert_arrow_table_equals(actual, expected)
Пример #30
0
 def test_duplicate_column_names_renamed(self):
     """Duplicate header names are uniquified and a warning is reported."""
     rendered = render_arrow(P(csv="A,A\na,b", has_header_row=True))
     assert_arrow_table_equals(rendered.table, {"A": ["a"], "A 2": ["b"]})
     expected_errors = [
         RenderError(
             I18nMessage.TODO_i18n(
                 "Renamed 1 duplicate column names (see “A 2”)"))
     ]
     self.assertEqual(rendered.errors, expected_errors)