Exemplo n.º 1
0
 def test_parquet_same_data_different_bytes(self):
     """A plain column and a `dictionary=True` column holding the same values compare equal."""
     plain_table = make_table(make_column("A", ["a"]))
     cjwparquet.write(self.old_path, plain_table)

     dictionary_table = make_table(make_column("A", ["a"], dictionary=True))
     cjwparquet.write(self.new_path, dictionary_table)

     old_result = FetchResult(self.old_path)
     new_result = FetchResult(self.new_path)
     self.assertTrue(are_fetch_results_equal(old_result, new_result))
Exemplo n.º 2
0
 def test_simple(self):
     """
     fetch_or_wrap_error() returns the kernel's result and forwards its inputs.

     Inspects the keyword arguments kernel.fetch() received: the compiled
     module, the params, the secrets, and `None` for both optional inputs.
     """
     self.kernel.fetch.return_value = FetchResult(self.output_path)
     string_param_spec = {"id_name": "A", "type": "string"}
     module_zipfile = create_module_zipfile(
         "mod", spec_kwargs={"parameters": [string_param_spec]})

     with self.assertLogs("fetcher.fetch", level=logging.INFO):
         result = fetch.fetch_or_wrap_error(
             self.ctx,
             self.chroot_context,
             self.basedir,
             "mod",
             module_zipfile,
             {"A": "B"},
             {"C": "D"},
             None,
             None,
             self.output_path,
         )

     self.assertEqual(result, FetchResult(self.output_path, []))

     # call_args[1] holds the keyword arguments of the most recent call.
     fetch_kwargs = self.kernel.fetch.call_args[1]
     self.assertEqual(
         fetch_kwargs["compiled_module"],
         module_zipfile.compile_code_without_executing(),
     )
     self.assertEqual(fetch_kwargs["params"], Params({"A": "B"}))
     self.assertEqual(fetch_kwargs["secrets"], {"C": "D"})
     self.assertIsNone(fetch_kwargs["last_fetch_result"])
     self.assertIsNone(fetch_kwargs["input_parquet_filename"])
Exemplo n.º 3
0
    def test_pass_last_fetch_result(self, downloaded_file):
        """The last FetchResult handed to fetch_or_wrap_error() reaches kernel.fetch()."""
        last_result_path = self.ctx.enter_context(
            tempfile_context(prefix="last-result"))
        result_path = self.ctx.enter_context(tempfile_context(prefix="result"))
        self.kernel.fetch.return_value = FetchResult(result_path, [])

        previous_result = FetchResult(last_result_path, [])
        module_zipfile = create_module_zipfile("mod")
        with self.assertLogs("fetcher.fetch", level=logging.INFO):
            fetch.fetch_or_wrap_error(
                self.ctx,
                self.chroot_context,
                self.basedir,
                "mod",
                module_zipfile,
                {},
                {},
                previous_result,
                None,
                self.output_path,
            )

        # Compare against a freshly built value, not the object we passed in.
        forwarded = self.kernel.fetch.call_args[1]["last_fetch_result"]
        self.assertEqual(forwarded, FetchResult(last_result_path, []))
Exemplo n.º 4
0
 def test_simple(self, load_module):
     """
     fetch_or_wrap_error() migrates params, then calls the module's fetch().

     The loaded module is a mock: migrate_params() must receive the step's
     stored params, and fetch() must receive the migrated params plus the
     secrets passed to fetch_or_wrap_error().
     """
     loaded_module = load_module.return_value
     loaded_module.migrate_params.return_value = {"A": "B"}
     loaded_module.fetch.return_value = FetchResult(self.output_path, [])

     wf_module = WfModule(params={"A": "input"}, secrets={"C": "wrong"})
     param_schema = ParamDType.Dict({"A": ParamDType.String()})
     module_version = MockModuleVersion(id_name="A", param_schema=param_schema)

     result = fetch.fetch_or_wrap_error(
         self.ctx,
         self.chroot_context,
         self.basedir,
         wf_module,
         module_version,
         {"C": "D"},
         None,
         None,
         self.output_path,
     )

     self.assertEqual(result, FetchResult(self.output_path, []))
     loaded_module.migrate_params.assert_called_with({"A": "input"})
     expected_fetch_kwargs = dict(
         chroot_context=self.chroot_context,
         basedir=self.basedir,
         params=Params({"A": "B"}),
         secrets={"C": "D"},
         last_fetch_result=None,
         input_parquet_filename=None,
         output_filename=self.output_path.name,
     )
     loaded_module.fetch.assert_called_with(**expected_fetch_kwargs)
Exemplo n.º 5
0
 def test_different_errors(self):
     """Results sharing a path but carrying different errors are not equal."""
     foo_errors = [RenderError(I18nMessage("foo", {}, None))]
     bar_errors = [RenderError(I18nMessage("bar", {}, None))]
     with_foo = FetchResult(self.old_path, foo_errors)
     with_bar = FetchResult(self.old_path, bar_errors)
     self.assertFalse(are_fetch_results_equal(with_foo, with_bar))
Exemplo n.º 6
0
 def test_bytes_different(self):
     """Files with different contents yield unequal fetch results."""
     old_bytes = b"slakdjhgt34kj5hlekretjhse3lk4j5ho234kj5rthsadf"
     new_bytes = b"salkdfhgbo324iu5q34rlkiuw3e47ytedasdfgaksjhg3r"
     self.old_path.write_bytes(old_bytes)
     self.new_path.write_bytes(new_bytes)

     are_equal = are_fetch_results_equal(
         FetchResult(self.old_path), FetchResult(self.new_path))
     self.assertFalse(are_equal)
Exemplo n.º 7
0
 def test_parquet_same_data_different_bytes(self):
     """A plain array and its dictionary-encoded form compare equal once written."""
     plain = arrow_table({"A": ["a"]}).table
     parquet.write(self.old_path, plain)

     encoded_column = pyarrow.array(["a"]).dictionary_encode()
     dictionary_encoded = arrow_table({"A": encoded_column}).table
     parquet.write(self.new_path, dictionary_encoded)

     old_result = FetchResult(self.old_path)
     new_result = FetchResult(self.new_path)
     self.assertTrue(are_fetch_results_equal(old_result, new_result))
Exemplo n.º 8
0
async def do_download(
    sheet_id: str, sheet_mime_type: str, oauth2_client: oauth2.Client, output_path: Path
) -> FetchResult:
    """
    Download a Google Drive file to `output_path`.

    A `sheet_mime_type` of 'application/vnd.google-apps.spreadsheet' is
    requested through the Google-Sheet URL; every other MIME type is
    requested through the plain GDrive file URL.

    HTTP failures are not raised: they come back as errors on the returned
    FetchResult. 401, 403 and 404 responses get dedicated messages.
    """
    # Messages for the HTTP statuses we explain to the user ourselves.
    messages_by_status = {
        401: "Invalid credentials. Please reconnect to Google Drive.",
        403: "You chose a file your logged-in user cannot access. Please reconnect to Google Drive or choose a different file.",
        404: "File not found. Please choose a different file.",
    }

    if sheet_mime_type == "application/vnd.google-apps.spreadsheet":
        url = _generate_google_sheet_url(sheet_id)
        # Retained from the original; nothing below reads it again.
        sheet_mime_type = "text/csv"
    else:
        # Any other file keeps the MIME type the caller passed.
        url = _generate_gdrive_file_url(sheet_id)

    url, headers, _ = oauth2_client.add_token(url, headers={})

    try:
        await httpfile.download(url, output_path, headers=headers, ssl=SSL_CONTEXT)
    except HttpError.NotSuccess as err:
        known_message = messages_by_status.get(err.response.status_code)
        if known_message is not None:
            return TODO_i18n_fetch_error(output_path, known_message)
        # HACK: *err.i18n_message because i18n_message is a tuple
        # compatible with I18nMessage() ctor
        generic_error = RenderError(I18nMessage(*err.i18n_message))
        return FetchResult(output_path, errors=[generic_error])
    except HttpError as err:
        # HACK: *err.i18n_message because i18n_message is a tuple
        # compatible with I18nMessage() ctor
        generic_error = RenderError(I18nMessage(*err.i18n_message))
        return FetchResult(output_path, errors=[generic_error])
    else:
        return FetchResult(output_path)
Exemplo n.º 9
0
    def test_fetch_get_stored_dataframe_happy_path(self):
        """get_stored_dataframe() yields the table held in the last fetch result's file."""
        async def fetch(params, *, get_stored_dataframe):
            stored = await get_stored_dataframe()
            expected = pd.DataFrame({"A": [1]})
            assert_frame_equal(stored, expected)

        with parquet_file({"A": [1]}, dir=self.basedir) as parquet_path:
            previous_result = FetchResult(parquet_path, [])
            self._test_fetch(fetch, last_fetch_result=previous_result)
Exemplo n.º 10
0
    def test_fetch_truncate(self):
        """
        A three-row fetch is written as two rows plus a truncation warning.

        The row limit itself is configured outside this method.
        """
        def fetch(params):
            return pd.DataFrame({"A": [1, 2, 3]})

        truncation_message = I18nMessage(
            "py.cjwkernel.pandas.types.ProcessResult.truncate_in_place_if_too_big.warning",
            {"old_number": 3, "new_number": 2},
            None,
        )

        with tempfile_context(dir=self.basedir) as outfile:
            result = self._test_fetch(fetch, output_filename=outfile.name)

            expected = FetchResult(outfile, errors=[FetchError(truncation_message)])
            self.assertEqual(result, expected)

            number_column = Column("A", ColumnType.Number("{:,}"))
            written_table = read_parquet_as_arrow(outfile, [number_column])
            assert_arrow_table_equals(
                written_table, make_table(make_column("A", [1, 2])))
Exemplo n.º 11
0
 def test_render_xlsx_bad_content(self):
     """A response labelled as XLSX whose body is not XLSX renders as an error."""
     with tempfile_context("fetch-") as http_path:
         bogus_body = io.BytesIO("ceçi n'est pas une .xlsx".encode("utf-8"))
         response_headers = [("content-type", XLSX_MIME_TYPE)]
         httpfile.write(
             http_path,
             {"url": "http://example.com/hello"},
             "200 OK",
             response_headers,
             bogus_body,
         )
         result = render_arrow(
             ArrowTable(),
             P(has_header=True),
             "tab-x",
             FetchResult(http_path),
             self.output_path,
         )

     expected_error = RenderError(
         I18nMessage.TODO_i18n(
             'Error reading Excel file: Unsupported format, or corrupt file: Expected BOF record; found b"ce\\xc3\\xa7i n\'"'
         )
     )
     self.assertEqual(result, RenderResult(ArrowTable(), [expected_error]))
Exemplo n.º 12
0
 def test_render_deprecated_parquet(self):
     """A fetch result stored as a Parquet file renders to the same table, with no errors."""
     columns = {"A": [1, 2], "B": [3, 4]}
     with parquet_file(columns) as fetched_path:
         fetch_result = FetchResult(fetched_path)
         result = render_arrow(
             ArrowTable(), P(), "tab-x", fetch_result, self.output_path)

     assert_arrow_table_equals(result.table, {"A": [1, 2], "B": [3, 4]})
     self.assertEqual(result.errors, [])
Exemplo n.º 13
0
 def _test_fetch(
     self,
     fetch_fn,
     *,
     params=None,
     secrets=None,
     last_fetch_result=None,
     input_table_parquet_path=None,
     output_filename=None,
 ):
     """
     Run `fetch_fn` through `module.fetch_thrift()` and return its FetchResult.

     `fetch_fn` temporarily replaces `module.fetch`. The keyword arguments
     are packed into a `ttypes.FetchRequest`; the Thrift response is decoded
     relative to `self.basedir`.

     `params` and `secrets` default to empty dicts. They are spelled `None`
     in the signature so no dict object is shared between calls.

     If `output_filename` is omitted, a temporary file in `self.basedir` is
     used and removed before this method returns, so the caller cannot read
     the fetched data.
     """
     if params is None:
         params = {}
     if secrets is None:
         secrets = {}

     with ExitStack() as ctx:
         ctx.enter_context(patch.object(module, "fetch", fetch_fn))
         if output_filename is None:
             # Make a temporary output filename -- this will make `fetch()`
             # complete, but callers won't be able to see the data it
             # outputs because we'll delete the file too soon.
             output_filename = ctx.enter_context(
                 tempfile_context(dir=self.basedir)).name

         thrift_last_fetch_result = (
             None if last_fetch_result is None
             else last_fetch_result.to_thrift())
         input_filename = (
             None if input_table_parquet_path is None
             else input_table_parquet_path.name)

         thrift_result = module.fetch_thrift(
             ttypes.FetchRequest(
                 basedir=str(self.basedir),
                 params=Params(params).to_thrift(),
                 secrets=RawParams(secrets).to_thrift(),
                 last_fetch_result=thrift_last_fetch_result,
                 input_table_parquet_filename=input_filename,
                 output_filename=output_filename,
             ))
         return FetchResult.from_thrift(thrift_result, self.basedir)
Exemplo n.º 14
0
 def test_input_crr_corrupt_cache_error_is_none(
     self, downloaded_parquet_file, load_module
 ):
     """A CorruptCacheError while downloading the input makes fetch() receive `None`."""
     loaded_module = load_module.return_value
     loaded_module.migrate_params.return_value = {}
     loaded_module.fetch.return_value = FetchResult(self.output_path, [])
     downloaded_parquet_file.side_effect = rendercache.CorruptCacheError(
         "file not found")

     text_column = Column("A", ColumnType.Text())
     input_metadata = TableMetadata(3, [text_column])
     input_crr = CachedRenderResult(1, 2, 3, "ok", [], {}, input_metadata)

     fetch.fetch_or_wrap_error(
         self.ctx,
         self.chroot_context,
         self.basedir,
         WfModule(),
         MockModuleVersion(),
         {},
         None,
         input_crr,
         self.output_path,
     )

     # fetch is still called, with `None` as argument.
     fetch_kwargs = loaded_module.fetch.call_args[1]
     self.assertIsNone(fetch_kwargs["input_parquet_filename"])
Exemplo n.º 15
0
 def test_input_crr(self, downloaded_parquet_file, clean_value, load_module):
     """
     A cached input render result is downloaded and its filename given to fetch().

     Also checks that clean_value() receives the CachedRenderResult's table
     metadata as its third positional argument.
     """
     loaded_module = load_module.return_value
     loaded_module.migrate_params.return_value = {}
     loaded_module.fetch.return_value = FetchResult(self.output_path, [])
     clean_value.return_value = {}
     downloaded_parquet_file.return_value = Path("/path/to/x.parquet")

     text_column = Column("A", ColumnType.Text())
     input_metadata = TableMetadata(3, [text_column])
     input_crr = CachedRenderResult(1, 2, 3, "ok", [], {}, input_metadata)

     fetch.fetch_or_wrap_error(
         self.ctx,
         self.chroot_context,
         self.basedir,
         WfModule(),
         MockModuleVersion(),
         {},
         None,
         input_crr,
         self.output_path,
     )

     # Passed file is downloaded from rendercache
     downloaded_parquet_file.assert_called_with(input_crr, dir=self.basedir)
     fetch_kwargs = loaded_module.fetch.call_args[1]
     self.assertEqual(fetch_kwargs["input_parquet_filename"], "x.parquet")

     # clean_value() is called with input metadata from CachedRenderResult
     clean_value.assert_called()
     clean_value_args = clean_value.call_args[0]
     self.assertEqual(clean_value_args[2], input_metadata)
Exemplo n.º 16
0
    def test_fetch_get_stored_dataframe_empty_file_is_empty_table(self):
        """An untouched temp file as the last fetch result reads back as an empty DataFrame."""
        async def fetch(params, *, get_stored_dataframe):
            stored = await get_stored_dataframe()
            assert_frame_equal(stored, pd.DataFrame())

        with tempfile_context(dir=self.basedir) as parquet_path:
            empty_file_result = FetchResult(parquet_path, [])
            self._test_fetch(fetch, last_fetch_result=empty_file_result)
Exemplo n.º 17
0
 def test_render_fetch_error(self):
     """Errors on the fetch result come through render unchanged, with an empty table."""
     fetch_errors = [RenderError(I18nMessage("x", {"y": "z"}))]
     with tempfile_context() as empty_path, self.render(
             P(), FetchResult(empty_path, fetch_errors)) as result:
         assert_arrow_table_equals(result.table, ArrowTable())
         self.assertEqual(result.errors, fetch_errors)
Exemplo n.º 18
0
def _stored_object_to_fetch_result(
    ctx: contextlib.ExitStack,
    stored_object: Optional[StoredObject],
    wf_module_fetch_error: str,
    dir: Path,
) -> Optional[FetchResult]:
    """
    Given a StoredObject (or None), return a FetchResult (or None).

    This cannot error. Any errors lead to a `None` return value.
    """
    if stored_object is None:
        return None
    else:
        try:
            last_fetch_path = ctx.enter_context(
                storedobjects.downloaded_file(stored_object, dir=dir))
            if wf_module_fetch_error:
                errors = [
                    RenderError(I18nMessage.TODO_i18n(wf_module_fetch_error))
                ]
            else:
                errors = []
            return FetchResult(last_fetch_path, errors)
        except FileNotFoundError:
            return None
Exemplo n.º 19
0
    def test_render_empty_file_fetch_result_is_parquet(self):
        """Rendering `fetch_result.dataframe` from an untouched temp file gives an empty table."""
        def render(*args, fetch_result):
            return fetch_result.dataframe

        with tempfile_context(dir=self.basedir) as tf:
            empty_fetch_result = FetchResult(tf)
            result = self._test_render(render, fetch_result=empty_fetch_result)
            expected = RenderResult(arrow_table({}))
            assert_render_result_equals(result, expected)
Exemplo n.º 20
0
 def test_render_deprecated_parquet(self):
     """A Parquet-file fetch result renders to the same columns with no errors."""
     columns = {"A": [1, 2], "B": [3, 4]}
     with parquet_file(columns) as fetched_path, self.render(
             P(), FetchResult(fetched_path)) as result:
         expected_table = {"A": [1, 2], "B": [3, 4]}
         assert_arrow_table_equals(result.table, expected_table)
         self.assertEqual(result.errors, [])
Exemplo n.º 21
0
    def test_render_with_parquet_fetch_result(self):
        """A render() that returns its fetch_result yields the fetched Parquet table."""
        def render(*args, fetch_result):
            return fetch_result

        with parquet_file({"A": ["fetched"]}, dir=self.basedir) as pf:
            parquet_fetch_result = FetchResult(pf)
            result = self._test_render(render, fetch_result=parquet_fetch_result)
            expected = RenderResult(arrow_table({"A": ["fetched"]}))
            assert_render_result_equals(result, expected)
Exemplo n.º 22
0
def fetch_arrow(
    params: Dict[str, Any],
    secrets: Dict[str, Any],
    last_fetch_result,
    input_table_parquet_path,
    output_path: Path,
) -> FetchResult:
    """
    Validate the chosen file and Google credentials, then download the file.

    Returns a FetchResult for `output_path`. A missing file choice, missing
    credentials or an error stored in the credentials each come back as an
    error on the FetchResult, without any download being attempted.
    """

    def _please_choose_a_file() -> FetchResult:
        # Both "nothing chosen" checks below return this same result.
        return FetchResult(
            output_path,
            errors=[RenderError(I18nMessage.TODO_i18n("Please choose a file"))],
        )

    file_meta = params["file"]
    if not file_meta:
        return _please_choose_a_file()

    # Ignore file_meta['url']. That's for the client's web browser, not for
    # an API request.
    sheet_id = file_meta["id"]
    if not sheet_id:
        # [adamhooper, 2019-12-06] has this ever happened?
        return _please_choose_a_file()

    # backwards-compat for old entries without 'mimeType', 2018-06-13
    default_mime_type = "application/vnd.google-apps.spreadsheet"
    sheet_mime_type = file_meta.get("mimeType", default_mime_type)

    secret = secrets.get("google_credentials")
    if not secret:
        return TODO_i18n_fetch_error(output_path, "Please connect to Google Drive.")
    if "error" in secret:
        credentials_error = RenderError(I18nMessage.from_dict(secret["error"]))
        return FetchResult(output_path, errors=[credentials_error])
    assert "secret" in secret

    token = secret["secret"]
    oauth2_client = oauth2.Client(
        client_id=None,  # unneeded
        token_type=token["token_type"],
        access_token=token["access_token"],
    )

    download = do_download(sheet_id, sheet_mime_type, oauth2_client, output_path)
    return asyncio.run(download)
Exemplo n.º 23
0
 def test_render_deprecated_parquet_warning(self):
     """Errors attached to a Parquet fetch result are kept alongside the rendered table."""
     errors = [RenderError(I18nMessage.TODO_i18n("truncated table"))]
     columns = {"A": [1, 2], "B": [3, 4]}
     with parquet_file(columns) as fetched_path, self.render(
             P(), FetchResult(fetched_path, errors)) as result:
         expected_table = {"A": [1, 2], "B": [3, 4]}
         assert_arrow_table_equals(result.table, expected_table)
         self.assertEqual(result.errors, errors)
Exemplo n.º 24
0
    def test_render_with_non_parquet_fetch_result(self):
        """render() can read a non-Parquet fetched file through `fetch_result.path`."""
        def render(*args, fetch_result):
            file_text = fetch_result.path.read_text()
            return pd.DataFrame({"A": [file_text]})

        with tempfile_context(dir=self.basedir) as tf:
            tf.write_bytes(b"abcd")
            raw_fetch_result = FetchResult(tf)
            result = self._test_render(render, fetch_result=raw_fetch_result)
            expected = RenderResult(arrow_table({"A": ["abcd"]}))
            assert_render_result_equals(result, expected)
Exemplo n.º 25
0
    def test_storage_limits(self, limit):
        """save.create_result() calls the patched `limit` mock with the step."""
        workflow = Workflow.create_and_init()
        first_tab = workflow.tabs.first()
        wf_module = first_tab.wf_modules.create(order=0, slug="step-1")

        with parquet_file({"A": [1], "B": ["x"]}) as parquet_path:
            pending_save = save.create_result(
                workflow.id, wf_module, FetchResult(parquet_path), timezone.now())
            self.run_with_async_db(pending_save)

        limit.assert_called_with(wf_module)
Exemplo n.º 26
0
    def test_render_empty_file_fetch_result_is_parquet(self):
        """An untouched temp file as fetch result exposes an empty `dataframe` to render()."""
        def render(table, params, *, fetch_result):
            expected_frame = pd.DataFrame({})
            assert_frame_equal(fetch_result.dataframe, expected_frame)
            return fetch_result.dataframe

        with ModuleTestEnv(render=render) as env, tempfile_context(
                dir=env.basedir) as tf:
            empty_fetch_result = FetchResult(tf)
            outcome = env.call_render(
                make_table(), {}, fetch_result=empty_fetch_result)
            self.assertEqual(outcome.read_table(), make_table())
Exemplo n.º 27
0
    def test_fetch_get_stored_dataframe_unhandled_parquet_is_error(self):
        """get_stored_dataframe() raises pa.ArrowIOError when the stored file holds b"12345"."""
        # Why an error? So module authors can handle it. They _created_ the
        # problem, after all. Let's help them detect it.
        async def fetch(params, *, get_stored_dataframe):
            with self.assertRaises(pa.ArrowIOError):
                await get_stored_dataframe()

        invalid_contents = b"12345"
        with tempfile_context(dir=self.basedir) as parquet_path:
            parquet_path.write_bytes(invalid_contents)
            invalid_result = FetchResult(parquet_path, [])
            self._test_fetch(fetch, last_fetch_result=invalid_result)
Exemplo n.º 28
0
def _load_fetch_result(
    wf_module: WfModule, basedir: Path, exit_stack: contextlib.ExitStack
) -> Optional[FetchResult]:
    """
    Copy the step's selected StoredObject into `basedir` so render() can read it.

    Return `None`, leaving no file behind, when:

    * no StoredObject matches `wf_module.stored_data_version`;
    * the StoredObject has no bucket or no key; or
    * the download raises FileNotFoundError.

    Otherwise return a FetchResult pointing at the downloaded file. Cleanup
    of that file is handed to `exit_stack`. If `wf_module.fetch_error` is
    set, the FetchResult carries it as its only error.
    """
    try:
        stored_object = wf_module.stored_objects.get(
            stored_at=wf_module.stored_data_version
        )
    except StoredObject.DoesNotExist:
        return None
    if not (stored_object.bucket and stored_object.key):
        return None

    with contextlib.ExitStack() as pending_cleanup:
        tempfile_cm = tempfile_context(prefix="fetch-result-", dir=basedir)
        path = pending_cleanup.enter_context(tempfile_cm)

        try:
            minio.download(stored_object.bucket, stored_object.key, path)
        except FileNotFoundError:
            # A few StoredObjects -- very old ones with size=0 -- are
            # *intentionally* not in minio. It turns out modules from that era
            # treated empty-file and None as identical. The _modules_ must
            # preserve that logic for backwards compatibility; so it's safe to
            # return `None` here.
            #
            # Other than that, if the file doesn't exist it's a race: either
            # the fetch result is too _new_ (it's in the database but its file
            # hasn't been written yet) or the fetch result is half-deleted (its
            # file was deleted and it's still in the database). In either case,
            # pretend the fetch result does not exist in the database -- i.e.,
            # return `None`.
            return None
        else:
            # Download succeeded, so `path` must outlive this `with` block.
            # pop_all() empties `pending_cleanup`; its close() is now run by
            # exit_stack instead.
            exit_stack.callback(pending_cleanup.pop_all().close)

    errors = (
        [RenderError(I18nMessage.TODO_i18n(wf_module.fetch_error))]
        if wf_module.fetch_error
        else []
    )
    return FetchResult(path, errors)
Exemplo n.º 29
0
    def test_race_hard_deleted_wf_module(self):
        """save.create_result() does not crash when the step's row was already deleted."""
        workflow = Workflow.create_and_init()
        first_tab = workflow.tabs.first()
        wf_module = first_tab.wf_modules.create(order=0, slug="step-1")
        WfModule.objects.filter(id=wf_module.id).delete()

        # Don't crash
        with parquet_file({"A": [1], "B": ["x"]}) as parquet_path:
            pending_save = save.create_result(
                workflow.id, wf_module, FetchResult(parquet_path), timezone.now())
            self.run_with_async_db(pending_save)
Exemplo n.º 30
0
def fetch_arrow(
    params: Dict[str, Any],
    secrets,
    last_fetch_result,
    input_table_parquet_path,
    output_path: Path,
) -> FetchResult:
    """
    Download `params["url"]` (whitespace-stripped) to `output_path`.

    The request's Accept header lists every value in AllowedMimeTypes,
    comma-separated. An HttpError is returned as an error on the
    FetchResult instead of being raised.
    """
    url: str = params["url"].strip()
    accept_header = ",".join(mime_type.value for mime_type in AllowedMimeTypes)

    try:
        asyncio.run(
            httpfile.download(
                url, output_path, headers=[("Accept", accept_header)]))
    except HttpError as err:
        # HACK: *err.i18n_message because i18n_message is a tuple
        # compatible with I18nMessage() ctor
        download_error = RenderError(I18nMessage(*err.i18n_message))
        return FetchResult(output_path, errors=[download_error])
    else:
        return FetchResult(output_path)