def render(arrow_table, params, output_path, *, fetch_result, **kwargs):
    """Render the fetched file, or report why there is nothing to render.

    Outcomes, checked in this order:

    * ``fetch_result is None``: return ``[]`` (empty table).
    * ``fetch_result.path`` is a Parquet file: delegate to
      ``_render_deprecated_parquet()``, passing along the stored errors.
    * ``fetch_result.errors`` is non-empty: return each error's message as a
      tuple.
    * otherwise: delegate to ``_render_file()``.
    """
    # The header operation must be performed here, in case the header
    # checkbox state changes.
    if fetch_result is None:
        return []  # empty table

    fetched_path = fetch_result.path
    is_deprecated_parquet = (
        fetched_path is not None
        and cjwparquet.file_has_parquet_magic_number(fetched_path)
    )
    if is_deprecated_parquet:
        # Deprecated files: we used to parse in fetch() and store the result
        # as Parquet. Now we've lost the original file data, and we need to
        # support our oldest users.
        #
        # In this deprecated format, parse errors were written as
        # fetch_result.errors.
        stored_errors = [tuple(err.message) for err in fetch_result.errors]
        return _render_deprecated_parquet(
            fetched_path, stored_errors, output_path, params
        )

    if fetch_result.errors:
        # We've never stored errors+data. If there are errors, assume
        # there's no data.
        #
        # We've never stored errors with quick-fixes.
        return [tuple(err.message) for err in fetch_result.errors]

    assert not fetch_result.errors  # we've never stored errors+data.
    return _render_file(fetched_path, params, output_path)
def get_accumulatable_api_endpoint_and_params(
    self,
) -> "Optional[Tuple[str, Dict[str, str]]]":
    """Return (endpoint, params) that generated this FetchResultFile.

    Return None if we cannot know or if there never was one: that is, when
    the file is a bare Parquet file or the tarfile has no members.

    The endpoint and the query string are read from the ``cjw:apiEndpoint``
    and ``cjw:apiParams`` pax headers of the tarfile's first member. The
    query string is parsed; only the first value of each key is kept; and
    the keys listed in ``ignored_keys`` below are dropped.

    Raises KeyError if the first member lacks either pax header.
    """
    # The annotation is a quoted forward reference: the function returns a
    # 2-tuple (not a bare dict), and quoting avoids evaluating typing names
    # when this function is defined.
    if file_has_parquet_magic_number(self.path):
        return None

    # Query-string keys that are left out of the returned params.
    ignored_keys = frozenset(
        (
            "expansions",
            "tweet.fields",
            "user.fields",
            "max_results",
            "count",
            "next_token",
            "since_id",
            "include_entities",
            "tweet_mode",
        )
    )

    with tarfile.open(self.path, mode="r") as tf:
        ti = tf.firstmember
        if ti is None:
            return None

        # This can _only_ be a ERROR.json.lz4 or [id].json.lz4 file.
        # A tarfile is either empty, or it contains a file _before_
        # LEGACY.parquet.
        api_endpoint = ti.pax_headers["cjw:apiEndpoint"]
        api_params = ti.pax_headers["cjw:apiParams"]

    return api_endpoint, {
        key: values[0]
        for key, values in urllib.parse.parse_qs(api_params).items()
        if key not in ignored_keys
    }
def call_render(
    module_spec: ModuleSpec, render: Callable, request: ttypes.RenderRequest
) -> ttypes.RenderResult:
    """Call a pandas-style ``render`` function on behalf of a Thrift request.

    Steps:

    * Load the request's input Arrow file (relative to ``request.basedir``)
      and convert it to a pandas dataframe.
    * Convert tab outputs and params to their pandas-side forms.
    * Build optional keyword arguments (``fetch_result``, ``settings``,
      ``tab_name``, ``input_columns``). Each is passed only when ``render``
      names it as a keyword-only argument or accepts ``**kwargs``.
    * Call ``render``, coerce its return value to a ``ProcessResult``
      (raising ValueError if invalid), truncate it if too big, write it as
      Arrow to ``request.output_filename`` and return the Thrift result.
    """
    basedir = Path(request.basedir)
    table = load_trusted_arrow_file(basedir / request.input_filename)
    dataframe = cjwpandasmodule.convert.arrow_table_to_pandas_dataframe(table)

    tab_outputs = {}
    for tab_key, thrift_tab_output in request.tab_outputs.items():
        tab_outputs[tab_key] = _thrift_tab_output_to_pandas(
            thrift_tab_output, basedir
        )

    params = _prepare_params(
        module_spec,
        thrift_json_object_to_pydict(request.params),
        basedir,
        tab_outputs,
    )

    argspec = inspect.getfullargspec(render)
    accepts_any_kwarg = bool(argspec.varkw)  # True: function accepts **kwargs

    def wants(name):
        # True when render() will accept the keyword argument `name`.
        return accepts_any_kwarg or name in argspec.kwonlyargs

    kwargs = {}

    if wants("fetch_result"):
        thrift_fetch_result = request.fetch_result
        if thrift_fetch_result is None:
            kwargs["fetch_result"] = None
        else:
            fetch_path = basedir / thrift_fetch_result.filename

            # Data comes in as FetchError and we return RenderError.
            render_errors = []
            for fetch_error in thrift_fetch_result.errors:
                render_errors.append(
                    RenderError(thrift_i18n_message_to_arrow(fetch_error.message))
                )

            fetched_nothing = fetch_path.stat().st_size == 0
            if fetched_nothing or cjwparquet.file_has_parquet_magic_number(
                fetch_path
            ):
                # infer columns -- the fetch interface doesn't handle formats
                # (TODO nix pandas_v0 fetching altogether by rewriting all
                # modules)
                kwargs["fetch_result"] = ptypes.ProcessResult(
                    dataframe=_parquet_to_pandas(fetch_path),
                    errors=render_errors,
                )
            else:
                # TODO nix pandas Fetch modules. (Do any use files, even?)
                kwargs["fetch_result"] = types.FetchResult(
                    path=fetch_path, errors=render_errors
                )

    if wants("settings"):
        kwargs["settings"] = settings

    if wants("tab_name"):
        kwargs["tab_name"] = request.tab_name

    if wants("input_columns"):
        kwargs["input_columns"] = arrow_schema_to_render_columns(table.schema)

    fallback_columns = read_columns(table, full=False)
    raw_result = render(dataframe, params, **kwargs)

    # raise ValueError if invalid
    pandas_result = ptypes.ProcessResult.coerce(
        raw_result, try_fallback_columns=fallback_columns
    )
    pandas_result.truncate_in_place_if_too_big()

    output_file = basedir / request.output_filename
    return arrow_render_result_to_thrift(pandas_result.to_arrow(output_file))
def get_error_result_part(self) -> Optional[ResultPart]:
    """Return the tarfile's first member as a ResultPart, if it is an error.

    A member counts as an error when its name contains "ERROR".

    Return None when the file is a bare Parquet file, when the tarfile has
    no members, or when the first member is not an error.

    NOTE(review): an earlier docstring called this "the last file";
    presumably the most recently fetched part is stored first in the
    archive -- confirm against the writer.
    """
    if file_has_parquet_magic_number(self.path):
        return None

    with tarfile.open(self.path, "r") as tf:
        ti = tf.firstmember
        if ti is None or "ERROR" not in ti.name:
            return None

        # Read the body into RAM. When we return, we'll close the tarfile
        # so any fileobject within it would be invalid. Close the member
        # file object explicitly once its bytes are read.
        with tarfile.TarFile.fileobject(tf, ti) as member_file:
            body = member_file.read()
        return ResultPart.for_tarinfo(ti, body)
def get_max_tweet_id(self) -> Optional[int]:
    """Calculate the maximum tweet ID throughout the fetched file.

    Return None if no tweets have been fetched.

    A bare Parquet file is handed to ``_parquet_file_to_max_tweet_id()``.
    Otherwise the tarfile's members are scanned in order and the first
    usable member decides the answer:

    * a non-error ``<id>.json.lz4`` member: the integer in its name;
    * a ``LEGACY.parquet`` member: whatever
      ``_parquet_file_to_max_tweet_id()`` computes from its contents.
    """
    if file_has_parquet_magic_number(self.path):
        return _parquet_file_to_max_tweet_id(self.path)

    json_suffix = ".json.lz4"

    with tarfile.open(self.path, mode="r") as archive:
        for member in archive:
            member_name = member.name

            is_tweet_part = (
                member_name.endswith(json_suffix) and "ERROR" not in member_name
            )
            if is_tweet_part:
                # The name is "<id>.json.lz4": strip the suffix, parse the id.
                return int(member_name[: -len(json_suffix)])

            if member_name == "LEGACY.parquet":
                with tarfile.TarFile.fileobject(archive, member) as legacy_fp:
                    return _parquet_file_to_max_tweet_id(legacy_fp)

    return None  # No tweets
def get_result_parts(self) -> Iterable[ResultPart]:
    """Iterate over ResultParts. May be interrupted.

    This is a generator so RAM usage is at a minimum. We only load one file
    into RAM at a time.

    A bare Parquet file yields at most one part (none if it has no rows).
    A tarfile yields one part per member, in archive order.
    """
    if file_has_parquet_magic_number(self.path):
        # If the file has no rows, then don't yield it. This way, we'll
        # never write an empty LEGACY.parquet into a v1 file.
        is_empty = _parquet_file_to_max_tweet_id(self.path) is None
        if not is_empty:
            # Assume the Parquet file is fairly small, so it fits in RAM
            yield ResultPart.for_parquet_bytes(self.path.read_bytes())
        return

    with tarfile.open(self.path, "r") as tf:
        for ti in tf:
            # Read the member fully and close its file object before
            # yielding, so no member reader is left open while the consumer
            # holds the generator suspended.
            with tarfile.TarFile.fileobject(tf, ti) as member_file:
                body = member_file.read()
            yield ResultPart.for_tarinfo(ti, body)
def render(arrow_table, params, output_path, *, fetch_result, **kwargs):
    """Render the fetched file into ``output_path``; return error messages.

    Outcomes, checked in this order:

    * ``fetch_result is None``: write zero bytes to ``output_path`` and
      return ``[]`` (empty table).
    * ``fetch_result.path`` is a Parquet file: delegate to
      ``_render_deprecated_parquet()``, passing along the stored errors.
    * ``fetch_result.errors`` is non-empty: write zero bytes to
      ``output_path`` and return each error's message as a tuple.
    * otherwise: delegate to ``_render_file()``.
    """

    def error_messages():
        # One tuple per stored fetch error.
        return [tuple(error.message) for error in fetch_result.errors]

    if fetch_result is None:
        # empty table
        output_path.write_bytes(b"")
        return []

    source_path = fetch_result.path
    if source_path is not None and cjwparquet.file_has_parquet_magic_number(
        source_path
    ):
        # Deprecated files: we used to parse in fetch() and store the result
        # as Parquet. Now we've lost the original file data, and we need to
        # support our oldest users.
        return _render_deprecated_parquet(
            source_path, error_messages(), output_path, params
        )

    if fetch_result.errors:
        # We've never stored errors+data. If there are errors, assume
        # there's no data.
        output_path.write_bytes(b"")
        return error_messages()

    assert not fetch_result.errors  # we've never stored errors+data.
    return _render_file(source_path, output_path, params)
def __render_pandas(
    *,
    table: types.ArrowTable,
    params: Dict[str, Any],
    tab_name: str,
    fetch_result: Optional[types.FetchResult],
    output_path: Path,
) -> types.RenderResult:
    """
    Call `render()` with the Pandas signature style.

    What this wrapper does:

    * Converts the input Arrow table and the input params to their Pandas
      forms.
    * Passes `fetch_result`, `settings`, `tab_name`, `input_columns` and
      `input_tabs` -- each only if `render()` names it as a keyword-only
      argument or accepts `**kwargs`.
    * Replaces `fetch_result` with a `ProcessResult` holding a Pandas
      dataframe when its file is zero bytes long or is a Parquet file;
      otherwise passes it through unchanged.
    * Coerces what `render()` returns to a `ProcessResult` (raising
      ValueError if invalid), truncates it if too big and writes it as
      Arrow to `output_path`.
    """
    # Convert input arguments
    pandas_table = __arrow_to_pandas(table)
    pandas_params = __arrow_param_to_pandas_param(params)

    argspec = inspect.getfullargspec(render)
    takes_any_kwarg = bool(argspec.varkw)  # True: function accepts **kwargs
    keyword_only = argspec.kwonlyargs
    kwargs = {}

    if takes_any_kwarg or "fetch_result" in keyword_only:
        if fetch_result is None:
            kwargs["fetch_result"] = None
        elif (
            fetch_result.path.stat().st_size == 0
            or cjwparquet.file_has_parquet_magic_number(fetch_result.path)
        ):
            kwargs["fetch_result"] = ptypes.ProcessResult(
                __parquet_to_pandas(fetch_result.path), fetch_result.errors
            )
        else:
            kwargs["fetch_result"] = fetch_result

    if takes_any_kwarg or "settings" in keyword_only:
        kwargs["settings"] = settings

    if takes_any_kwarg or "tab_name" in keyword_only:
        kwargs["tab_name"] = tab_name

    if takes_any_kwarg or "input_columns" in keyword_only:
        input_columns = {}
        for column in table.metadata.columns:
            input_columns[column.name] = ptypes.RenderColumn(
                column.name,
                column.type.name,
                getattr(column.type, "format", None),
            )
        kwargs["input_columns"] = input_columns

    if takes_any_kwarg or "input_tabs" in keyword_only:
        input_tabs = {}
        for tab_output in __find_tab_outputs(params):
            input_tabs[tab_output.tab.slug] = __arrow_tab_output_to_pandas(
                tab_output
            )
        kwargs["input_tabs"] = input_tabs

    # call render()
    raw_result = render(pandas_table, pandas_params, **kwargs)

    # Coerce outputs; raise ValueError if invalid
    result = ptypes.ProcessResult.coerce(
        raw_result, try_fallback_columns=table.metadata.columns
    )
    result.truncate_in_place_if_too_big()

    return result.to_arrow(output_path)
def test_empty_parquet_file(self):
    """A Parquet file built from an empty dict still has the magic number."""
    with parquet_file({}) as path:
        has_magic = cjwparquet.file_has_parquet_magic_number(path)
        self.assertTrue(has_magic)
def test_good_magic_numbers_but_too_short_to_be_parquet(self):
    """The four bytes "PAR1" alone must not be detected as Parquet."""
    # Parquet has PAR1 at the beginning and end. But the file "PAR1" on its
    # own is not a Parquet file.
    with tempfile_context() as path:
        path.write_bytes(b"PAR1")
        has_magic = cjwparquet.file_has_parquet_magic_number(path)
        self.assertFalse(has_magic)
def test_very_short_file(self):
    """A three-byte file ("PAR") must not be detected as Parquet."""
    with tempfile_context() as path:
        path.write_bytes(b"PAR")
        has_magic = cjwparquet.file_has_parquet_magic_number(path)
        self.assertFalse(has_magic)
def test_empty_file(self):
    """A temp file nothing was written to must not be detected as Parquet."""
    with tempfile_context() as path:
        has_magic = cjwparquet.file_has_parquet_magic_number(path)
        self.assertFalse(has_magic)
def test_parquet_file_has_magic_numbers(self):
    """A Parquet file with one column and one row has the magic number."""
    with parquet_file({"A": [1]}) as path:
        has_magic = cjwparquet.file_has_parquet_magic_number(path)
        self.assertTrue(has_magic)