Exemplo n.º 1
0
def _save_parquet(df: LocalDataFrame, p: FileParser, **kwargs: Any) -> None:
    """Write ``df`` to ``p.uri`` as parquet via pandas.

    The ``pyarrow`` engine and the dataframe's own pyarrow schema
    (``df.schema.pa_schema``) are used as defaults; any entry in
    ``kwargs`` with the same key overrides the default.

    :param df: dataframe to save; converted with ``as_pandas()``
    :param p: parsed file location, only ``p.uri`` is read
    :param kwargs: extra options forwarded to ``DataFrame.to_parquet``
    """
    pdf = df.as_pandas()
    target = p.uri
    options = {"engine": "pyarrow", "schema": df.schema.pa_schema}
    # Caller-supplied options take precedence over the defaults above.
    options.update(kwargs)
    pdf.to_parquet(target, **options)
Exemplo n.º 2
0
def _save_avro(df: LocalDataFrame, p: FileParser, **kwargs: Any):
    """Write ``df`` to ``p.uri`` as avro using pandavro.

    The keys ``schema``, ``append`` and ``times_as_micros`` are taken out
    of ``kwargs`` and passed to ``pandavro.to_avro`` explicitly; when a key
    is absent its default below is used. Everything else in ``kwargs`` is
    forwarded unchanged.

    :param df: dataframe to save; converted with ``as_pandas()``
    :param p: parsed file location, only ``p.uri`` is read
    :param kwargs: options for ``pandavro.to_avro``; supplying ``schema``
        is the preferred way to control the saved dtypes
    """
    import pandavro as pdx

    kw = ParamDict(kwargs)

    # Defaults used when the caller does not supply the key. ``append``
    # defaults to False, i.e. the target is overwritten rather than
    # appended to.
    explicit = {"schema": None, "append": False, "times_as_micros": True}
    for key in explicit:
        if key in kw:
            explicit[key] = kw[key]
            # Drop it so it is not passed a second time through **kw.
            del kw[key]

    pdf = df.as_pandas()
    pdx.to_avro(p.uri, pdf, **explicit, **kw)
Exemplo n.º 3
0
 def transform(self, df: LocalDataFrame) -> LocalDataFrame:
     """Add a ``p`` and a ``ct`` column to ``df`` and return the result.

     Before transforming, this asserts that ``on_init_called`` equals 1,
     that ``"test"`` is a key of ``workflow_conf`` and that ``"x"`` is in
     ``df.metadata``.

     :param df: input dataframe; converted with ``as_pandas()``
     :return: a ``PandasDataFrame`` built with ``self.output_schema``
     """
     assert self.on_init_called == 1
     assert "test" in self.workflow_conf
     assert "x" in df.metadata
     frame = df.as_pandas()
     # ``p`` comes from the transformer params, falling back to 1.
     p_value = self.params.get("p", 1)
     frame["p"] = p_value
     # ``ct`` holds the number of rows in the frame.
     row_count = frame.shape[0]
     frame["ct"] = row_count
     return PandasDataFrame(frame, self.output_schema)
Exemplo n.º 4
0
def _save_avro(df: LocalDataFrame,
               p: FileParser,
               columns: Any = None,
               **kwargs: Any):
    """Write ``df`` to ``p.uri`` as avro using pandavro.

    If providing your own schema, the usage of the ``schema`` argument is
    preferred.

    The keys ``schema``, ``infer_schema``, ``append`` and
    ``times_as_micros`` are removed from ``kwargs`` before the remaining
    options are forwarded to ``pandavro.to_avro``. ``infer_schema`` is only
    validated and is never forwarded.

    :param df: dataframe to save; converted with ``as_pandas()``
    :param p: parsed file location, only ``p.uri`` is read
    :param columns: when ``schema`` is passed explicitly as ``None`` and
        ``columns`` is not ``None``, the avro schema is built with
        ``_convert_pyarrow_to_avro_schema(df, columns)``.
        NOTE(review): ``columns`` is ignored when the ``schema`` key is
        absent from ``kwargs`` -- confirm this is intended.
    :param kwargs: options for ``pandavro.to_avro``
    :raises Exception: if both a schema and ``columns`` are given, or if
        ``infer_schema`` is truthy while a schema is set
    """
    # Function-scope import, matching the sibling avro writer.
    import pandavro as pdx

    kw = ParamDict(kwargs)

    # pandavro defaults
    schema = None
    append = False  # default is overwrite (False) instead of append (True)
    times_as_micros = True

    if "schema" in kw:
        schema = kw["schema"]
        if schema is None:
            if columns is not None:
                schema = _convert_pyarrow_to_avro_schema(df, columns)
        elif columns:
            # both schema and columns provided
            raise Exception("set columns to None when schema is provided")
        # Remove the key that was just consumed. The previous code deleted
        # "infer_schema" here, which raised KeyError when that key was
        # absent and otherwise left "schema" in kw, so it was passed twice
        # to pdx.to_avro.
        del kw["schema"]

    if "infer_schema" in kw:
        infer_schema = kw["infer_schema"]
        if infer_schema and (schema is not None):
            # infer_schema set to True but schema was provided
            raise Exception(
                "set infer_schema to False when schema is provided")
        del kw["infer_schema"]

    if "append" in kw:
        append = kw["append"]
        del kw["append"]

    if "times_as_micros" in kw:
        times_as_micros = kw["times_as_micros"]
        del kw["times_as_micros"]

    pdf = df.as_pandas()
    pdx.to_avro(p.uri,
                pdf,
                schema=schema,
                append=append,
                times_as_micros=times_as_micros,
                **kw)
Exemplo n.º 5
0
def _save_json(df: LocalDataFrame, p: FileParser, **kwargs: Any) -> None:
    """Write ``df`` to ``p.uri`` as JSON via pandas.

    Defaults to ``orient="records"`` with ``lines=True``; any entry in
    ``kwargs`` with the same key overrides the default.

    :param df: dataframe to save; converted with ``as_pandas()``
    :param p: parsed file location, only ``p.uri`` is read
    :param kwargs: extra options forwarded to ``DataFrame.to_json``
    """
    pdf = df.as_pandas()
    options = {"orient": "records", "lines": True}
    # Caller-supplied options take precedence over the defaults above.
    options.update(kwargs)
    pdf.to_json(p.uri, **options)
Exemplo n.º 6
0
def _save_csv(df: LocalDataFrame, p: FileParser, **kwargs: Any) -> None:
    """Write ``df`` to ``p.uri`` as CSV via pandas.

    By default neither the index nor a header row is written; any entry
    in ``kwargs`` with the same key overrides the default.

    :param df: dataframe to save; converted with ``as_pandas()``
    :param p: parsed file location, only ``p.uri`` is read
    :param kwargs: extra options forwarded to ``DataFrame.to_csv``
    """
    pdf = df.as_pandas()
    options = {"index": False, "header": False}
    # Caller-supplied options take precedence over the defaults above.
    options.update(kwargs)
    pdf.to_csv(p.uri, **options)
Exemplo n.º 7
0
def _save_json(df: LocalDataFrame, p: FileParser, **kwargs: Any) -> None:
    """Write ``df`` to ``p.uri`` as JSON via pandas.

    No defaults are applied here; ``kwargs`` is forwarded as-is.

    :param df: dataframe to save; converted with ``as_pandas()``
    :param p: parsed file location, only ``p.uri`` is read
    :param kwargs: options forwarded to ``DataFrame.to_json``
    """
    pdf = df.as_pandas()
    pdf.to_json(p.uri, **kwargs)