Example #1
def render(arrow_table, params, output_path, *, fetch_result, **kwargs):
    # The header operation must happen here, in render(), rather than in
    # fetch(), so a change to the header-checkbox state takes effect
    # without a re-fetch
    if fetch_result is None:
        # empty table
        return []
    elif fetch_result.path is not None and cjwparquet.file_has_parquet_magic_number(
            fetch_result.path):
        # Deprecated files: we used to parse in fetch() and store the result
        # as Parquet. Now we've lost the original file data, and we need to
        # support our oldest users.
        #
        # In this deprecated format, parse errors were written as
        # fetch_result.errors.
        return _render_deprecated_parquet(
            fetch_result.path,
            [tuple(e.message) for e in fetch_result.errors],
            output_path,
            params,
        )
    elif fetch_result.errors:
        # We've never stored errors+data. If there are errors, assume
        # there's no data.
        #
        # We've never stored errors with quick-fixes
        return [tuple(e.message) for e in fetch_result.errors]
    else:
        assert not fetch_result.errors  # we've never stored errors+data.
        return _render_file(fetch_result.path, params, output_path)
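
Every branch above (and in the examples below) dispatches on cjwparquet.file_has_parquet_magic_number(). As the tests in Examples 10-13 confirm, the check looks for the b"PAR1" magic bytes at both ends of the file and rejects files too short to hold both. A minimal sketch of such a check, not cjwparquet's actual implementation:

from pathlib import Path


def has_parquet_magic_number(path: Path) -> bool:
    """Sketch: True if the file starts and ends with b"PAR1".

    A bare b"PAR1" is rejected: the two magic numbers must be distinct,
    so anything under 8 bytes cannot be Parquet.
    """
    with path.open("rb") as f:
        if f.read(4) != b"PAR1":
            return False
        f.seek(0, 2)  # seek to end of file
        if f.tell() < 8:
            return False
        f.seek(-4, 2)
        return f.read(4) == b"PAR1"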
Example #2
    def get_accumulatable_api_endpoint_and_params(
            self) -> Optional[Tuple[str, Dict[str, str]]]:
        """Return (endpoint, params) that generated this FetchResultFile.

        Return None if we cannot know or if there never was one.
        """
        if file_has_parquet_magic_number(self.path):
            return None

        with tarfile.open(self.path, mode="r") as tf:
            ti = tf.firstmember
            if ti is None:
                return None

            # This can _only_ be an ERROR.json.lz4 or [id].json.lz4 file.
            # A tarfile is either empty, or it contains a file _before_
            # LEGACY.parquet.
            api_endpoint = ti.pax_headers["cjw:apiEndpoint"]
            api_params = ti.pax_headers["cjw:apiParams"]
            return api_endpoint, {
                key: values[0]
                for key, values in urllib.parse.parse_qs(api_params).items()
                if key not in {
                    "expansions",
                    "tweet.fields",
                    "user.fields",
                    "max_results",
                    "count",
                    "next_token",
                    "since_id",
                    "include_entities",
                    "tweet_mode",
                }
            }
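
For context: pax_headers are per-member key/value strings that survive a round trip through the POSIX pax tar format, tarfile's default since Python 3.8. A hedged sketch of a writer that would produce members this reader can consume (the helper name is hypothetical):

import io
import tarfile


def add_member_with_api_headers(tf: tarfile.TarFile, name: str, body: bytes,
                                api_endpoint: str, api_params_qs: str) -> None:
    # Attach the cjw:-prefixed headers the reader above expects.
    ti = tarfile.TarInfo(name)
    ti.size = len(body)
    ti.pax_headers = {
        "cjw:apiEndpoint": api_endpoint,
        "cjw:apiParams": api_params_qs,
    }
    tf.addfile(ti, io.BytesIO(body))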
Example #3
def call_render(module_spec: ModuleSpec, render: Callable,
                request: ttypes.RenderRequest) -> ttypes.RenderResult:
    basedir = Path(request.basedir)
    input_path = basedir / request.input_filename
    table = load_trusted_arrow_file(input_path)
    dataframe = cjwpandasmodule.convert.arrow_table_to_pandas_dataframe(table)
    tab_outputs = {
        k: _thrift_tab_output_to_pandas(v, basedir)
        for k, v in request.tab_outputs.items()
    }
    params = _prepare_params(module_spec,
                             thrift_json_object_to_pydict(request.params),
                             basedir, tab_outputs)
    spec = inspect.getfullargspec(render)
    kwargs = {}
    varkw = bool(spec.varkw)  # if True, function accepts **kwargs
    kwonlyargs = spec.kwonlyargs
    if varkw or "fetch_result" in kwonlyargs:
        if request.fetch_result is None:
            fetch_result = None
        else:
            fetch_result_path = basedir / request.fetch_result.filename
            errors = [
                # Data comes in as FetchError and we return RenderError.
                RenderError(thrift_i18n_message_to_arrow(e.message))
                for e in request.fetch_result.errors
            ]
            if (fetch_result_path.stat().st_size == 0
                    or cjwparquet.file_has_parquet_magic_number(
                        fetch_result_path)):
                fetch_result = ptypes.ProcessResult(
                    dataframe=_parquet_to_pandas(fetch_result_path),
                    errors=errors,
                    # infer columns -- the fetch interface doesn't handle formats
                    # (TODO nix pandas_v0 fetching altogether by rewriting all modules)
                )
            else:
                # TODO nix pandas Fetch modules. (Do any use files, even?)
                fetch_result = types.FetchResult(path=fetch_result_path,
                                                 errors=errors)
        kwargs["fetch_result"] = fetch_result
    if varkw or "settings" in kwonlyargs:
        kwargs["settings"] = settings
    if varkw or "tab_name" in kwonlyargs:
        kwargs["tab_name"] = request.tab_name
    if varkw or "input_columns" in kwonlyargs:
        kwargs["input_columns"] = arrow_schema_to_render_columns(table.schema)

    input_columns = read_columns(table, full=False)
    raw_result = render(dataframe, params, **kwargs)

    # raise ValueError if invalid
    pandas_result = ptypes.ProcessResult.coerce(
        raw_result, try_fallback_columns=input_columns)
    pandas_result.truncate_in_place_if_too_big()

    arrow_result = pandas_result.to_arrow(basedir / request.output_filename)
    return arrow_render_result_to_thrift(arrow_result)
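
The signature-probing idiom above (repeated in Example 8) is worth isolating: inspect.getfullargspec() reveals which keyword-only arguments render() declares, and whether it has a **kwargs catch-all; only arguments the function can accept are passed. A self-contained sketch of the same pattern, with hypothetical render functions:

import inspect
from typing import Any, Callable, Dict


def select_kwargs(fn: Callable, available: Dict[str, Any]) -> Dict[str, Any]:
    """Return the subset of `available` that `fn` is willing to accept."""
    spec = inspect.getfullargspec(fn)
    if spec.varkw:  # fn(**kwargs) accepts everything
        return dict(available)
    return {k: v for k, v in available.items() if k in spec.kwonlyargs}


def render_a(table, params, *, tab_name, **kwargs):
    return tab_name


def render_b(table, params, *, fetch_result=None):
    return fetch_result


assert select_kwargs(render_a, {"tab_name": "Tab 1", "settings": None}) == {
    "tab_name": "Tab 1", "settings": None}
assert select_kwargs(render_b, {"tab_name": "Tab 1", "fetch_result": None}) == {
    "fetch_result": None}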
Example #4
    def get_error_result_part(self) -> Optional[ResultPart]:
        """Return the last file, if it's error."""
        if file_has_parquet_magic_number(self.path):
            return None

        with tarfile.open(self.path, "r") as tf:
            ti = tf.firstmember
            if ti is not None and "ERROR" in ti.name:
                # Read the body into RAM. When we return, we'll close the tarfile
                # so any fileobject within it would be invalid.
                body = tarfile.TarFile.fileobject(tf, ti).read()
                return ResultPart.for_tarinfo(ti, body)
        return None
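
Note that tarfile.TarFile.fileobject is tarfile's internal ExFileObject class, instantiated directly here. The documented route to the same bytes is TarFile.extractfile(); a sketch using the public API:

import tarfile
from typing import Optional


def read_member_bytes(tar_path, member_name: str) -> Optional[bytes]:
    with tarfile.open(tar_path, "r") as tf:
        f = tf.extractfile(member_name)  # None for non-regular members
        if f is None:
            return None
        with f:
            return f.read()  # read into RAM before the tarfile closes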
Example #5
    def get_max_tweet_id(self) -> Optional[int]:
        """Calculate the maximum tweet ID throughout the fetched file.

        Return None if no tweets have been fetched.
        """
        if file_has_parquet_magic_number(self.path):
            return _parquet_file_to_max_tweet_id(self.path)

        with tarfile.open(self.path, mode="r") as tf:
            for ti in tf:
                if ti.name.endswith(".json.lz4") and "ERROR" not in ti.name:
                    return int(ti.name[:-9])  # strip ".json.lz4" (9 chars)
                if ti.name == "LEGACY.parquet":
                    with tarfile.TarFile.fileobject(tf, ti) as f:
                        return _parquet_file_to_max_tweet_id(f)

        return None  # No tweets
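
The slice ti.name[:-9] drops the 9-character ".json.lz4" suffix, leaving the numeric tweet ID; the early return suggests the newest (highest-ID) results are stored first in the archive. A quick check of the slicing, with a made-up member name:

name = "1346119539967791106.json.lz4"  # hypothetical member name
assert len(".json.lz4") == 9
assert int(name[:-9]) == 1346119539967791106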
Example #6
    def get_result_parts(self) -> Iterable[ResultPart]:
        """Iterate over ResultParts. May be interrupted.

        This is a generator so RAM usage is at a minimum. We only load one file
        into RAM at a time.
        """
        if file_has_parquet_magic_number(self.path):
            # If the file has no rows, then don't yield it. This way, we'll
            # never write an empty LEGACY.parquet into a v1 file.
            is_empty = _parquet_file_to_max_tweet_id(self.path) is None
            if not is_empty:
                # Assume the Parquet file is fairly small, so it fits in RAM
                yield ResultPart.for_parquet_bytes(self.path.read_bytes())
            return

        with tarfile.open(self.path, "r") as tf:
            for ti in tf:
                yield ResultPart.for_tarinfo(
                    ti,
                    tarfile.TarFile.fileobject(tf, ti).read())
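
The one-member-at-a-time behavior the docstring promises comes from tarfile itself: iterating a TarFile yields TarInfo records lazily, so only the body currently being read is resident. The idiom in isolation, assuming nothing about ResultPart:

import tarfile
from typing import Iterator, Tuple


def iter_member_bodies(tar_path) -> Iterator[Tuple[str, bytes]]:
    # Generator: each body is read only when the caller asks for it,
    # so peak RAM is roughly one member, not the whole archive.
    with tarfile.open(tar_path, "r") as tf:
        for ti in tf:
            f = tf.extractfile(ti)
            if f is not None:  # skip directories and other non-files
                yield ti.name, f.read()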
Example #7
def render(arrow_table, params, output_path, *, fetch_result, **kwargs):
    if fetch_result is None:
        # empty table
        output_path.write_bytes(b"")
        return []
    elif fetch_result.path is not None and cjwparquet.file_has_parquet_magic_number(
            fetch_result.path):
        # Deprecated files: we used to parse in fetch() and store the result
        # as Parquet. Now we've lost the original file data, and we need to
        # support our oldest users.
        return _render_deprecated_parquet(
            fetch_result.path,
            [tuple(e.message) for e in fetch_result.errors],
            output_path,
            params,
        )
    elif fetch_result.errors:
        # We've never stored errors+data. If there are errors, assume
        # there's no data.
        output_path.write_bytes(b"")
        return [tuple(e.message) for e in fetch_result.errors]
    else:
        assert not fetch_result.errors  # we've never stored errors+data.
        return _render_file(fetch_result.path, output_path, params)
Example #8
def __render_pandas(
    *,
    table: types.ArrowTable,
    params: Dict[str, Any],
    tab_name: str,
    fetch_result: Optional[types.FetchResult],
    output_path: Path,
) -> types.RenderResult:
    """
    Call `render()` with the Pandas signature style.

    Features:

    * Convert input Arrow table to a Pandas dataframe
    * Convert input params to Pandas format (producing extra arguments like
      `input_tabs` as needed).
    * Convert input `fetch_result` to Pandas dataframe, if it is a valid
      Parquet file.
    * Coerce output from a Pandas dataframe to an Arrow table
    * Coerce output errors/json
    """
    # Convert input arguments
    pandas_table = __arrow_to_pandas(table)
    pandas_params = __arrow_param_to_pandas_param(params)

    spec = inspect.getfullargspec(render)
    kwargs = {}
    varkw = bool(spec.varkw)  # if True, function accepts **kwargs
    kwonlyargs = spec.kwonlyargs
    if varkw or "fetch_result" in kwonlyargs:
        if fetch_result is not None:
            if (fetch_result.path.stat().st_size == 0
                    or cjwparquet.file_has_parquet_magic_number(
                        fetch_result.path)):
                fetched_table = __parquet_to_pandas(fetch_result.path)
                pandas_fetch_result = ptypes.ProcessResult(
                    fetched_table, fetch_result.errors)
            else:
                pandas_fetch_result = fetch_result
        else:
            pandas_fetch_result = None
        kwargs["fetch_result"] = pandas_fetch_result
    if varkw or "settings" in kwonlyargs:
        kwargs["settings"] = settings
    if varkw or "tab_name" in kwonlyargs:
        kwargs["tab_name"] = tab_name
    if varkw or "input_columns" in kwonlyargs:
        kwargs["input_columns"] = {
            c.name: ptypes.RenderColumn(c.name, c.type.name,
                                        getattr(c.type, "format", None))
            for c in table.metadata.columns
        }
    if varkw or "input_tabs" in kwonlyargs:
        kwargs["input_tabs"] = {
            to.tab.slug: __arrow_tab_output_to_pandas(to)
            for to in __find_tab_outputs(params)
        }

    # call render()
    raw_result = render(pandas_table, pandas_params, **kwargs)

    # Coerce outputs
    result = ptypes.ProcessResult.coerce(
        raw_result, try_fallback_columns=table.metadata.columns
    )  # raise ValueError if invalid
    result.truncate_in_place_if_too_big()

    return result.to_arrow(output_path)
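
The Arrow-to-Pandas leg of this pipeline is conceptually a one-liner. A hedged sketch of what __arrow_to_pandas might reduce to (the real helper likely does more, e.g. category and timestamp handling):

import pandas as pd
import pyarrow as pa


def arrow_to_pandas(table: pa.Table) -> pd.DataFrame:
    # deduplicate_objects shares identical Python strings across cells,
    # cutting RAM for repetitive text columns.
    return table.to_pandas(deduplicate_objects=True)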
Example #9
    def test_empty_parquet_file(self):
        with parquet_file({}) as path:
            self.assertTrue(cjwparquet.file_has_parquet_magic_number(path))
Example #10
    def test_good_magic_numbers_but_too_short_to_be_parquet(self):
        # Parquet has PAR1 at the beginning and end. But the file "PAR1" on
        # its own is not a Parquet file.
        with tempfile_context() as path:
            path.write_bytes(b"PAR1")
            self.assertFalse(cjwparquet.file_has_parquet_magic_number(path))
Example #11
    def test_very_short_file(self):
        with tempfile_context() as path:
            path.write_bytes(b"PAR")
            self.assertFalse(cjwparquet.file_has_parquet_magic_number(path))
Example #12
    def test_empty_file(self):
        with tempfile_context() as path:
            self.assertFalse(cjwparquet.file_has_parquet_magic_number(path))
Example #13
    def test_parquet_file_has_magic_numbers(self):
        with parquet_file({"A": [1]}) as path:
            self.assertTrue(cjwparquet.file_has_parquet_magic_number(path))
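
These tests lean on two fixtures: tempfile_context(), which yields an empty temporary Path, and parquet_file({...}), which writes a mapping of columns to a temporary Parquet file. A minimal sketch of the latter, assuming pyarrow (the real fixture may differ, especially for the zero-column case in Example 9):

import contextlib
import tempfile
from pathlib import Path

import pyarrow as pa
import pyarrow.parquet as pq


@contextlib.contextmanager
def parquet_file(columns):
    # columns: mapping of column name -> list of values, e.g. {"A": [1]}
    with tempfile.NamedTemporaryFile(suffix=".parquet") as tf:
        path = Path(tf.name)
        pq.write_table(pa.table(columns), path)
        yield path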