示例#1
0
    def test_execute_mark_unreachable(self, send_update):
        # When the first step errors, the steps after it must be cached as
        # "unreachable": no errors of their own and no table data.
        #
        # `send_update` is a mock (presumably injected by a patch decorator
        # that is not visible here). It is awaited by the code under test, so
        # it must return an already-resolved Future.
        future_none = asyncio.Future()
        future_none.set_result(None)
        send_update.return_value = future_none

        workflow = Workflow.create_and_init()
        tab = workflow.tabs.first()
        create_module_zipfile(
            "mod",
            spec_kwargs={"loads_data": True},
            python_code=
            'def render(table, params): return "error, not warning"',
        )
        step1 = tab.steps.create(order=0, slug="step-1", module_id_name="mod")
        step2 = tab.steps.create(order=1, slug="step-2", module_id_name="mod")
        step3 = tab.steps.create(order=2, slug="step-3", module_id_name="mod")

        self._execute(workflow)

        # step1: error
        step1.refresh_from_db()
        self.assertEqual(step1.cached_render_result.status, "error")
        with open_cached_render_result(step1.cached_render_result) as result:
            self.assertEqual(result.path.read_bytes(), b"")
        self.assertEqual(
            step1.cached_render_result.errors,
            [RenderError(TODO_i18n("error, not warning"))],
        )

        # step2, step3: unreachable (no errors, no table data). Both steps get
        # the same checks, including the status check.
        for unreachable_step in (step2, step3):
            unreachable_step.refresh_from_db()
            cached = unreachable_step.cached_render_result
            self.assertEqual(cached.status, "unreachable")
            with open_cached_render_result(cached) as result:
                self.assertEqual(result.path.read_bytes(), b"")
            self.assertEqual(cached.errors, [])

        # assert_called_with() checks the most recent call: the update for
        # the last step.
        send_update.assert_called_with(
            workflow.id,
            clientside.Update(
                steps={
                    step3.id:
                    clientside.StepUpdate(
                        render_result=step3.cached_render_result,
                        module_slug="mod")
                }),
        )
示例#2
0
    def test_execute_new_revision(self):
        # A step whose cached result was written for delta 1 is moved on to
        # delta 2; executing the workflow must replace the cached output with
        # what the module renders.
        workflow = Workflow.create_and_init()
        tab = workflow.tabs.first()
        module_code = (
            'import pandas as pd\n'
            'def render(table, params): return pd.DataFrame({"B": [2]})'
        )
        create_module_zipfile(
            "mod",
            spec_kwargs={"loads_data": True},
            python_code=module_code,
        )
        stale_step = tab.steps.create(
            order=0,
            slug="step-1",
            last_relevant_delta_id=1,
            module_id_name="mod",
        )

        # Cache an output for delta 1 ...
        old_output = RenderResult(arrow_table({"A": [1]}))
        cache_render_result(workflow, stale_step, 1, old_output)

        # ... then point the step at delta 2, so that cache entry is outdated.
        stale_step.last_relevant_delta_id = 2
        stale_step.save(update_fields=["last_relevant_delta_id"])

        self._execute(workflow)

        stale_step.refresh_from_db()
        fresh_cache = stale_step.cached_render_result
        with open_cached_render_result(fresh_cache) as actual:
            assert_render_result_equals(
                actual, RenderResult(arrow_table({"B": [2]})))
    def test_execute_new_revision(self):
        # Cache a result for the workflow's first delta, create a second
        # delta, point the step at it, and check that executing the workflow
        # replaces the cached output with what the module renders.
        workflow = Workflow.create_and_init()
        tab = workflow.tabs.first()
        first_delta = workflow.last_delta
        module_code = (
            'import pandas as pd\n'
            'def render(table, params): return pd.DataFrame({"B": [2]})'
        )
        create_module_zipfile("mod", python_code=module_code)
        step = tab.wf_modules.create(
            order=0,
            slug="step-1",
            last_relevant_delta_id=first_delta.id,
            module_id_name="mod",
        )

        # Output cached for the first delta
        cache_render_result(workflow, step, first_delta.id,
                            RenderResult(arrow_table({"A": [1]})))

        # Move the step on to a newer delta
        second_delta = InitWorkflowCommand.create(workflow)
        step.last_relevant_delta_id = second_delta.id
        step.save(update_fields=["last_relevant_delta_id"])

        self._execute(workflow)

        step.refresh_from_db()
        with open_cached_render_result(step.cached_render_result) as actual:
            assert_render_result_equals(
                actual, RenderResult(arrow_table({"B": [2]})))
示例#4
0
    def test_execute_new_revision(self):
        # The step is at delta 2 but the render cache only holds a table
        # written for delta 1; executing must produce the module's output.
        workflow = Workflow.create_and_init()
        tab = workflow.tabs.first()
        module_code = (
            'import pandas as pd\n'
            'def render(table, params): return pd.DataFrame({"B": [2]})'
        )
        create_module_zipfile(
            "mod",
            spec_kwargs={"loads_data": True},
            python_code=module_code,
        )
        step = tab.steps.create(
            order=0,
            slug="step-1",
            last_relevant_delta_id=2,
            module_id_name="mod",
        )

        # stale: written for delta 1, not the step's delta 2
        stale_table = make_table(make_column("A", ["a"]))
        write_to_rendercache(workflow, step, 1, stale_table)

        self._execute(workflow)

        step.refresh_from_db()
        with open_cached_render_result(step.cached_render_result) as actual:
            assert_arrow_table_equals(actual.table,
                                      make_table(make_column("B", [2])))
示例#5
0
def _execute_wfmodule_save(
    workflow: Workflow, wf_module: WfModule, result: RenderResult
) -> SaveResult:
    """
    Store `result` in the render cache and decide whether to notify.

    Everything here runs synchronously while the `locked_wf_module()` lock is
    held. (It is a standalone function so that, once the caller has finished
    awaiting it, the caller can carry on in a context that doesn't use a
    database thread.)

    Return a SaveResult holding the step's new cached render result plus
    either a `notifications.OutputDelta` -- when notifications are enabled, a
    stale result could be read and `result` differs from it -- or None.

    Raise UnneededExecution if the WfModule has changed in the interim.
    """
    # raises UnneededExecution
    with locked_wf_module(workflow, wf_module) as safe_wf_module:
        # The previous output only matters when notifications are enabled.
        stale_result = None
        if safe_wf_module.notifications:
            stale_crr = safe_wf_module.get_stale_cached_render_result()
            if stale_crr is not None:
                try:
                    # Read entire old Parquet file, blocking
                    with rendercache.open_cached_render_result(
                        stale_crr
                    ) as stale_result:
                        # stale_result is deleted from disk but still mmapped
                        pass
                except rendercache.CorruptCacheError:
                    # Don't send an email: a corrupt cache probably means
                    # we've been messing with our codebase.
                    logger.exception(
                        "Ignoring CorruptCacheError on workflow %d, wf_module %d because we are about to overwrite it",
                        workflow.id,
                        wf_module.id,
                    )
                    stale_result = None

        rendercache.cache_render_result(
            workflow, safe_wf_module, wf_module.last_relevant_delta_id, result
        )

        should_notify = (
            safe_wf_module.notifications
            and stale_result is not None
            and result != stale_result
        )
        if not should_notify:
            # nothing to email
            return SaveResult(safe_wf_module.cached_render_result, None)

        safe_wf_module.has_unseen_notification = True
        safe_wf_module.save(update_fields=["has_unseen_notification"])
        output_delta = notifications.OutputDelta(
            safe_wf_module.workflow.owner,
            safe_wf_module.workflow,
            safe_wf_module,
            stale_result,
            result,
        )
        return SaveResult(safe_wf_module.cached_render_result, output_delta)
示例#6
0
def wfmodule_value_counts(request: HttpRequest, wf_module: WfModule):
    """
    Respond with `{"values": {value: count}}` for one column of a step.

    The column name comes from the `column` query parameter and the data
    from the step's cached render result. Responses:

    * 400 when the `column` parameter is missing;
    * empty `values` when the parameter is empty or nothing is cached;
    * 404 when the column is not in the cached metadata, or the cache file
      is corrupt;
    * otherwise the counts, with every value formatted as a string.
    """
    try:
        colname = request.GET["column"]
    except KeyError:
        return JsonResponse({"error": 'Missing a "column" parameter'}, status=400)

    if not colname:
        # User has not yet chosen a column. Empty response.
        return JsonResponse({"values": {}})

    cached_result = wf_module.cached_render_result
    if cached_result is None:
        # assume we'll get another request after execute finishes
        return JsonResponse({"values": {}})

    # Find the requested column and its position in the cached metadata.
    for column_index, column in enumerate(cached_result.table_metadata.columns):
        if column.name == colname:
            break
    else:
        return JsonResponse({"error": f'column "{colname}" not found'}, status=404)

    try:
        # raise CorruptCacheError
        with open_cached_render_result(cached_result) as result:
            arrow_column = result.table.table.column(column_index)
            # series may be of any type, not just str/categorical
            series = arrow_column.to_pandas(
                deduplicate_objects=True, ignore_metadata=True
            )
    except CorruptCacheError:
        # We _could_ return an empty result set; but our only goal here is
        # "don't crash" and this 404 seems to be the simplest implementation.
        # (We assume that if the data is deleted, the user has moved elsewhere
        # and this response is going to be ignored.)
        return JsonResponse({"error": f'column "{colname}" not found'}, status=404)

    # We only handle string. If it's not string, convert to string. (Rationale:
    # this is used in Refine and Filter by Value, which are both solely
    # String-based for now. Excel and Google Sheets only filter by String
    # values, so we're in good company.) Remember: in JavaScript, Object keys
    # must be String.
    column_type = ptypes.ColumnType.from_arrow(column.type)
    formatted = column_type.format_series(series)

    return JsonResponse({"values": formatted.value_counts().to_dict()})
示例#7
0
    def test_resume_without_rerunning_unneeded_renders(self, fake_load_module):
        # The first step already has a cached result for the current delta;
        # the second has none. Executing must call render() exactly once.
        #
        # `fake_load_module` is a mock (presumably injected by a patch
        # decorator that is not visible here).
        workflow = Workflow.create_and_init()
        tab = workflow.tabs.first()
        delta_id = workflow.last_delta_id
        module_spec = dict(
            id_name="mod",
            name="Mod",
            category="Clean",
            parameters=[],
        )
        ModuleVersion.create_or_replace_from_spec(module_spec)

        # cached_step: has a valid, cached result
        cached_step = tab.wf_modules.create(
            order=0,
            slug="step-1",
            last_relevant_delta_id=delta_id,
            module_id_name="mod",
        )
        already_cached = RenderResult(arrow_table({"A": [1]}))
        cache_render_result(workflow, cached_step, delta_id, already_cached)

        # uncached_step: has no cached result (must be rendered)
        uncached_step = tab.wf_modules.create(
            order=1,
            slug="step-2",
            last_relevant_delta_id=delta_id,
            module_id_name="mod",
        )

        module_mock = Mock(LoadedModule)
        module_mock.migrate_params.return_value = {}
        fake_load_module.return_value = module_mock
        expected = RenderResult(arrow_table({"A": [2]}))
        module_mock.render.return_value = expected

        self._execute(workflow)

        module_mock.render.assert_called_once()  # only with the second step

        uncached_step.refresh_from_db()
        rendered_cache = uncached_step.cached_render_result
        with open_cached_render_result(rendered_cache) as actual:
            assert_render_result_equals(actual, expected)
示例#8
0
    def test_resume_without_rerunning_unneeded_renders(self):
        # The first step has a cached result and the second has none. The
        # second step's output shows whether the first step was re-rendered.
        workflow = Workflow.create_and_init()
        tab = workflow.tabs.first()
        delta_id = workflow.last_delta_id
        # If this code runs on the first step, it'll return pd.DataFrame().
        # If it runs on the second step, it'll return first-step-output * 2.
        # ... so the second step's output depends on whether we run this on
        # the first step.
        doubling_code = "def render(table, params): return table * 2"
        create_module_zipfile(
            "mod",
            spec_kwargs={"loads_data": True},
            python_code=doubling_code,
        )

        # has a valid, cached result
        cached_step = tab.steps.create(
            order=0,
            slug="step-1",
            last_relevant_delta_id=1,
            module_id_name="mod",
        )
        cached_output = RenderResult(arrow_table({"A": [1]}))
        cache_render_result(workflow, cached_step, 1, cached_output)

        # has no cached result (must be rendered)
        pending_step = tab.steps.create(
            order=1,
            slug="step-2",
            last_relevant_delta_id=1,
            module_id_name="mod",
        )

        self._execute(workflow)

        pending_step.refresh_from_db()
        rendered_cache = pending_step.cached_render_result
        with open_cached_render_result(rendered_cache) as actual:
            assert_render_result_equals(
                actual, RenderResult(arrow_table({"A": [2]})))
示例#9
0
    def test_execute_new_revision(self, fake_load_module):
        # Cache a result for the first delta, move the step to a second
        # delta, and check that executing stores the mocked module's output.
        #
        # `fake_load_module` is a mock (presumably injected by a patch
        # decorator that is not visible here).
        workflow = Workflow.create_and_init()
        tab = workflow.tabs.first()
        first_delta = workflow.last_delta
        module_spec = dict(
            id_name="mod",
            name="Mod",
            category="Clean",
            parameters=[],
        )
        ModuleVersion.create_or_replace_from_spec(module_spec)
        step = tab.wf_modules.create(
            order=0,
            slug="step-1",
            last_relevant_delta_id=first_delta.id,
            module_id_name="mod",
        )

        # Output cached for the first delta
        cache_render_result(workflow, step, first_delta.id,
                            RenderResult(arrow_table({"A": [1]})))

        # Move the step on to a newer delta
        second_delta = InitWorkflowCommand.create(workflow)
        step.last_relevant_delta_id = second_delta.id
        step.save(update_fields=["last_relevant_delta_id"])

        expected = RenderResult(arrow_table({"B": [2]}))
        module_mock = Mock(LoadedModule)
        module_mock.migrate_params.return_value = {}
        fake_load_module.return_value = module_mock
        module_mock.render.return_value = expected

        self._execute(workflow)

        step.refresh_from_db()
        with open_cached_render_result(step.cached_render_result) as actual:
            assert_render_result_equals(actual, expected)
示例#10
0
def wfmodule_value_counts(request: HttpRequest, wf_module: WfModule):
    """
    Respond with `{"values": {text: count}}` for one text column of a step.

    The column name comes from the `column` query parameter and the data
    from the step's cached render result. Responses:

    * 400 when the `column` parameter is missing;
    * empty `values` when the parameter is empty, nothing is cached, the
      column is not text, or the column has no chunks;
    * 404 when the column is not in the cached metadata, or the cache file
      is corrupt;
    * otherwise one entry per dictionary value, up to the highest dictionary
      index that is actually used (entries below it that are never used are
      reported with a count of 0).
    """
    try:
        colname = request.GET["column"]
    except KeyError:
        return JsonResponse({"error": 'Missing a "column" parameter'},
                            status=400)

    if not colname:
        # User has not yet chosen a column. Empty response.
        return JsonResponse({"values": {}})

    cached_result = wf_module.cached_render_result
    if cached_result is None:
        # assume we'll get another request after execute finishes
        return JsonResponse({"values": {}})

    # Find the requested column and its position in the cached metadata.
    for column_index, column in enumerate(
            cached_result.table_metadata.columns):
        if column.name == colname:
            break
    else:
        return JsonResponse({"error": f'column "{colname}" not found'},
                            status=404)

    if not isinstance(column.type, ColumnType.Text):
        # We only return text values.
        #
        # Rationale: this is only used in Refine and Filter by Value. Both
        # force text. The user can query a column before it's converted to
        # text; but if they do, we shouldn't format as text unless we have
        # a viable workflow that needs it. (Better would be to force the user
        # to convert to text before doing anything else, no?)
        return JsonResponse({"values": {}})

    try:
        # raise CorruptCacheError
        with open_cached_render_result(cached_result) as result:
            chunked_array = result.table.table.column(column_index)
    except CorruptCacheError:
        # We _could_ return an empty result set; but our only goal here is
        # "don't crash" and this 404 seems to be the simplest implementation.
        # (We assume that if the data is deleted, the user has moved elsewhere
        # and this response is going to be ignored.)
        return JsonResponse({"error": f'column "{colname}" not found'},
                            status=404)

    if chunked_array.num_chunks == 0:
        return JsonResponse({"values": {}})

    assert chunked_array.num_chunks == 1

    # Assume type is text. (We checked column.type is ColumnType.Text above.)
    chunk = chunked_array.chunks[0]
    if not hasattr(chunk, "dictionary"):
        chunk = chunk.dictionary_encode()

    # Dictionary indices of every non-null cell, in row order.
    used_indices = [v.as_py() for v in chunk.indices if v is not pa.NULL]

    # All nulls => no indices. The "-1" default makes `counts` empty, so the
    # result below is empty too.
    max_index = max(used_indices, default=-1)

    counts = np.zeros(max_index + 1, dtype=int)
    for index in used_indices:
        counts[index] += 1

    # zip() stops at the shorter input: dictionary entries past max_index
    # are left out.
    value_counts = {}
    for value, count in zip(chunk.dictionary, counts):
        value_counts[value.as_py()] = int(count)

    return JsonResponse({"values": value_counts})
示例#11
0
async def result_column_value_counts(
    request: HttpRequest,
    workflow_id: int,
    step_slug: str,
    delta_id: int,
) -> JsonResponse:
    """
    Respond with `{"values": {text: count}}` for one text column of a step.

    The column name comes from the `column` query parameter and the data
    from the step's cached render result. Responses:

    * BAD_REQUEST when the `column` parameter is missing;
    * empty `values` when the parameter is empty, the cached result is
      missing or belongs to a different `delta_id`, or the column is not
      text;
    * NOT_FOUND when the column is not in the cached metadata, or the cache
      file is corrupt;
    * otherwise the counts of every non-null value, with response cache
      headers set via `patch_response_headers(..., cache_timeout=600)`.

    Raise Http404 or PermissionDenied from `_load_workflow_and_step()`.
    """
    try:
        colname = request.GET["column"]
    except KeyError:
        return JsonResponse({"error": 'Missing a "column" parameter'},
                            status=status.BAD_REQUEST)

    if not colname:
        # User has not yet chosen a column. Empty response.
        return JsonResponse({"values": {}})

    # raise Http404, PermissionDenied
    loaded = await _load_workflow_and_step(request, workflow_id, step_slug,
                                           "all")
    _, step = loaded
    cached_result = step.cached_render_result
    if cached_result is None or cached_result.delta_id != delta_id:
        # assume we'll get another request after execute finishes
        return JsonResponse({"values": {}})

    # Find the requested column and its position in the cached metadata.
    for column_index, column in enumerate(
            cached_result.table_metadata.columns):
        if column.name == colname:
            break
    else:
        return JsonResponse({"error": f'column "{colname}" not found'},
                            status=status.NOT_FOUND)

    if not isinstance(column.type, ColumnType.Text):
        # We only return text values.
        #
        # Rationale: this is only used in Refine and Filter by Value. Both
        # force text. The user can query a column before it's converted to
        # text; but if they do, we shouldn't format as text unless we have
        # a viable workflow that needs it. (Better would be to force the user
        # to convert to text before doing anything else, no?)
        return JsonResponse({"values": {}})

    try:
        # raise CorruptCacheError
        with open_cached_render_result(cached_result) as result:
            chunked_array = result.table.table.column(column_index)
    except CorruptCacheError:
        # We _could_ return an empty result set; but our only goal here is
        # "don't crash" and this 404 seems to be the simplest implementation.
        # (We assume that if the data is deleted, the user has moved elsewhere
        # and this response is going to be ignored.)
        return JsonResponse({"error": f'column "{colname}" not found'},
                            status=status.NOT_FOUND)

    value_counts = {}
    if chunked_array.num_chunks != 0:
        pyarrow_value_counts = chunked_array.value_counts()
        # Assume type is text. (We checked column.type is ColumnType.Text
        # above.)
        #
        # values can be either a StringArray or a DictionaryArray. In either
        # case, .to_pylist() converts to a Python List[str].
        values = pyarrow_value_counts.field("values").to_pylist()
        counts = pyarrow_value_counts.field("counts").to_pylist()

        # Nulls are not reported.
        for value, count in zip(values, counts):
            if value is not None:
                value_counts[value] = count

    response = JsonResponse({"values": value_counts})
    patch_response_headers(response, cache_timeout=600)
    return response
示例#12
0
    def test_execute_mark_unreachable(self, send_update, fake_load_module):
        # The mocked module always renders an error. The first step must be
        # cached as "error" and the two after it as "unreachable".
        #
        # `send_update` and `fake_load_module` are mocks (presumably injected
        # by patch decorators that are not visible here). `send_update` is
        # given an already-resolved Future as its return value.
        resolved = asyncio.Future()
        resolved.set_result(None)
        send_update.return_value = resolved

        workflow = Workflow.create_and_init()
        tab = workflow.tabs.first()
        delta_id = workflow.last_delta_id
        module_spec = dict(
            id_name="mod",
            name="Mod",
            category="Clean",
            parameters=[],
        )
        ModuleVersion.create_or_replace_from_spec(module_spec)

        # Three identical steps, created in order.
        failing_step, middle_step, last_step = (
            tab.wf_modules.create(
                order=position,
                slug=slug,
                last_relevant_delta_id=delta_id,
                module_id_name="mod",
            )
            for position, slug in enumerate(("step-1", "step-2", "step-3"))
        )

        module_mock = Mock(LoadedModule)
        fake_load_module.return_value = module_mock
        module_mock.migrate_params.return_value = {}
        error_result = RenderResult(
            errors=[RenderError(I18nMessage.TODO_i18n("error, not warning"))])
        module_mock.render.return_value = error_result

        self._execute(workflow)

        failing_step.refresh_from_db()
        failing_cache = failing_step.cached_render_result
        self.assertEqual(failing_cache.status, "error")
        with open_cached_render_result(failing_cache) as result:
            assert_render_result_equals(result, error_result)

        for skipped_step in (middle_step, last_step):
            skipped_step.refresh_from_db()
            skipped_cache = skipped_step.cached_render_result
            self.assertEqual(skipped_cache.status, "unreachable")
            with open_cached_render_result(skipped_cache) as result:
                assert_render_result_equals(result, RenderResult())

        # assert_called_with() checks the most recent call.
        expected_update = clientside.Update(
            steps={
                last_step.id:
                clientside.StepUpdate(
                    render_result=last_step.cached_render_result)
            })
        send_update.assert_called_with(workflow.id, expected_update)
示例#13
0
    def test_execute_mark_unreachable(self, send_update):
        # The module's render() returns an error string. The first step must
        # be cached as "error" and the two after it as "unreachable".
        #
        # `send_update` is a mock (presumably injected by a patch decorator
        # that is not visible here); it is given an already-resolved Future
        # as its return value.
        resolved = asyncio.Future()
        resolved.set_result(None)
        send_update.return_value = resolved

        workflow = Workflow.create_and_init()
        tab = workflow.tabs.first()
        delta_id = workflow.last_delta_id
        failing_code = 'def render(table, params): return "error, not warning"'
        create_module_zipfile("mod", python_code=failing_code)

        # Three identical steps, created in order.
        failing_step, middle_step, last_step = (
            tab.wf_modules.create(
                order=position,
                slug=slug,
                last_relevant_delta_id=delta_id,
                module_id_name="mod",
            )
            for position, slug in enumerate(("step-1", "step-2", "step-3"))
        )

        error_result = RenderResult(
            errors=[RenderError(I18nMessage.TODO_i18n("error, not warning"))])

        self._execute(workflow)

        failing_step.refresh_from_db()
        failing_cache = failing_step.cached_render_result
        self.assertEqual(failing_cache.status, "error")
        with open_cached_render_result(failing_cache) as result:
            assert_render_result_equals(result, error_result)

        for skipped_step in (middle_step, last_step):
            skipped_step.refresh_from_db()
            skipped_cache = skipped_step.cached_render_result
            self.assertEqual(skipped_cache.status, "unreachable")
            with open_cached_render_result(skipped_cache) as result:
                assert_render_result_equals(result, RenderResult())

        # assert_called_with() checks the most recent call.
        expected_update = clientside.Update(
            steps={
                last_step.id:
                clientside.StepUpdate(
                    render_result=last_step.cached_render_result,
                    module_slug="mod")
            })
        send_update.assert_called_with(workflow.id, expected_update)
示例#14
0
    def test_execute_mark_unreachable(self, send_delta_async,
                                      fake_load_module):
        # The mocked module always renders an error. The first step must be
        # cached as "error" and the two after it as "unreachable".
        #
        # `send_delta_async` and `fake_load_module` are mocks (presumably
        # injected by patch decorators that are not visible here).
        # `future_none` is not defined in this method; it must come from the
        # enclosing scope.
        send_delta_async.return_value = future_none

        workflow = Workflow.create_and_init()
        tab = workflow.tabs.first()
        delta_id = workflow.last_delta_id
        module_spec = dict(
            id_name="mod",
            name="Mod",
            category="Clean",
            parameters=[],
        )
        ModuleVersion.create_or_replace_from_spec(module_spec)

        # Three identical steps, created in order.
        failing_step, middle_step, last_step = (
            tab.wf_modules.create(
                order=position,
                slug=slug,
                last_relevant_delta_id=delta_id,
                module_id_name="mod",
            )
            for position, slug in enumerate(("step-1", "step-2", "step-3"))
        )

        module_mock = Mock(LoadedModule)
        fake_load_module.return_value = module_mock
        module_mock.migrate_params.return_value = {}
        error_result = RenderResult(
            errors=[RenderError(I18nMessage.TODO_i18n("error, not warning"))])
        module_mock.render.return_value = error_result

        self._execute(workflow)

        failing_step.refresh_from_db()
        failing_cache = failing_step.cached_render_result
        self.assertEqual(failing_cache.status, "error")
        with open_cached_render_result(failing_cache) as result:
            assert_render_result_equals(result, error_result)

        for skipped_step in (middle_step, last_step):
            skipped_step.refresh_from_db()
            skipped_cache = skipped_step.cached_render_result
            self.assertEqual(skipped_cache.status, "unreachable")
            with open_cached_render_result(skipped_cache) as result:
                assert_render_result_equals(result, RenderResult())

        # assert_called_with() checks the most recent call.
        last_step_update = {
            "output_status": "unreachable",
            "quick_fixes": [],
            "output_error": "",
            "output_columns": [],
            "output_n_rows": 0,
            "cached_render_result_delta_id": delta_id,
        }
        send_delta_async.assert_called_with(
            workflow.id,
            {"updateWfModules": {str(last_step.id): last_step_update}},
        )