def _save_parquet(df: LocalDataFrame, p: FileParser, **kwargs: Any) -> None:
    """Write ``df`` to ``p.uri`` as a parquet file via pandas.

    Defaults to the ``pyarrow`` engine and the dataframe's own pyarrow
    schema; both defaults can be overridden through ``kwargs``.
    """
    options = {"engine": "pyarrow", "schema": df.schema.pa_schema}
    options.update(kwargs)
    df.as_pandas().to_parquet(p.uri, **options)
def _save_avro(df: LocalDataFrame, p: FileParser, **kwargs: Any):
    """Save pandas dataframe as avro.

    If providing your own schema, the usage of schema argument is preferred

    :param schema: Avro Schema determines dtypes saved
    """
    import pandavro as pdx

    kw = ParamDict(kwargs)
    # pandavro defaults, each overridable through kwargs
    schema = None
    append = False  # default is overwrite (False) instead of append (True)
    times_as_micros = True
    if "schema" in kw:
        schema = kw["schema"]
        del kw["schema"]
    if "append" in kw:
        append = kw["append"]
        del kw["append"]
    if "times_as_micros" in kw:
        times_as_micros = kw["times_as_micros"]
        del kw["times_as_micros"]
    pdx.to_avro(
        p.uri,
        df.as_pandas(),
        schema=schema,
        append=append,
        times_as_micros=times_as_micros,
        **kw,
    )
def transform(self, df: LocalDataFrame) -> LocalDataFrame:
    """Check init/conf/metadata preconditions, then append ``p`` and ``ct`` columns.

    ``p`` comes from params (default 1); ``ct`` is the partition row count.
    """
    assert 1 == self.on_init_called
    assert "test" in self.workflow_conf
    assert "x" in df.metadata
    result = df.as_pandas()
    result["p"] = self.params.get("p", 1)
    result["ct"] = result.shape[0]
    return PandasDataFrame(result, self.output_schema)
def run(self, cursor: PartitionCursor, df: LocalDataFrame) -> LocalDataFrame: self.transformer._cursor = cursor # type: ignore df._metadata = self.metadata try: to_local_bounded_df(self.transformer.transform(df)) return ArrayDataFrame([], self.transformer.output_schema) except self.ignore_errors: # type: ignore return ArrayDataFrame([], self.transformer.output_schema)
def _save_avro(df: LocalDataFrame, p: FileParser, columns: Any = None, **kwargs: Any):
    """Save pandas dataframe as avro.

    If providing your own schema, the usage of schema argument is preferred

    :param df: dataframe to save
    :param p: parsed file location to write to
    :param columns: columns used to derive an avro schema when no explicit
        ``schema`` is given; must be ``None`` when ``schema`` is provided
    :raises Exception: if both ``schema`` and ``columns`` are provided, or if
        ``infer_schema`` is True while an explicit ``schema`` is provided
    """
    import pandavro as pdx  # local import, consistent with the other avro saver

    kw = ParamDict(kwargs)
    # pandavro defaults
    schema = None
    append = False
    times_as_micros = True
    if "schema" in kw:
        schema = kw["schema"]
        if schema is None:
            if columns is not None:
                schema = _convert_pyarrow_to_avro_schema(df, columns)
        else:
            if columns:  # both schema and columns provided
                raise Exception("set columns to None when schema is provided")
        # BUGFIX: was `del kw["infer_schema"]`, which raised KeyError when
        # infer_schema was absent and leaked `schema` into **kw below.
        del kw["schema"]
    if "infer_schema" in kw:
        infer_schema = kw["infer_schema"]
        if infer_schema and (schema is not None):
            # infer_schema set to True but schema was provided
            raise Exception("set infer_schema to False when schema is provided")
        del kw["infer_schema"]
    if "append" in kw:
        append = kw["append"]  # default is overwrite (False) instead of append (True)
        del kw["append"]
    if "times_as_micros" in kw:
        times_as_micros = kw["times_as_micros"]
        del kw["times_as_micros"]
    pdf = df.as_pandas()
    pdx.to_avro(
        p.uri,
        pdf,
        schema=schema,
        append=append,
        times_as_micros=times_as_micros,
        **kw,
    )
def run(self, cursor: PartitionCursor, df: LocalDataFrame) -> LocalDataFrame:
    """Run the wrapped transformer on one partition.

    With no ignorable errors configured, the (possibly lazy) transform result
    is returned directly. Otherwise the result is materialized so errors
    surface here, and configured error types yield an empty output frame.
    """
    self.transformer._cursor = cursor  # type: ignore
    df._metadata = self.metadata
    if not self.ignore_errors:
        return self.transformer.transform(df)
    try:
        return to_local_bounded_df(self.transformer.transform(df))
    except self.ignore_errors:  # type: ignore  # pylint: disable=E0712
        return ArrayDataFrame([], self.transformer.output_schema)
def _save_json(df: LocalDataFrame, p: FileParser, **kwargs: Any) -> None: df.as_pandas().to_json(p.uri, **{ "orient": "records", "lines": True, **kwargs })
def _save_csv(df: LocalDataFrame, p: FileParser, **kwargs: Any) -> None: df.as_pandas().to_csv(p.uri, **{"index": False, "header": False, **kwargs})
def f26(e: pd.DataFrame, a: LocalDataFrame) -> Iterable[Dict[str, Any]]:
    """Concatenate the rows of ``e`` and ``a`` and yield them as dicts (schema a:int)."""
    rows = list(PandasDataFrame(e).as_array())
    rows.extend(a.as_array())
    return ArrayDataFrame(rows, "a:int").as_dict_iterable()
def f25(e: DataFrame, a: LocalDataFrame) -> List[Dict[str, Any]]:
    """Combine the arrays of ``e`` and ``a`` into a list of row dicts (schema a:int)."""
    combined = e.as_array()
    combined += list(a.as_array())
    result = ArrayDataFrame(combined, "a:int")
    return list(result.as_dict_iterable())
def count(self, df: LocalDataFrame) -> int:
    """Return the number of rows, streaming through the data when unbounded."""
    if not df.is_bounded:
        # Unbounded frames can't report a count directly; consume the iterator.
        return sum(1 for _ in df.as_array_iterable())
    return df.count()
def _save_json(df: LocalDataFrame, p: FileParser, **kwargs: Any) -> None: df.as_pandas().to_json(p.uri, **kwargs)