示例#1
0
def _save_parquet(df: LocalDataFrame, p: FileParser, **kwargs: Any) -> None:
    """Write ``df`` as a parquet file at ``p.uri`` using pandas.

    The write defaults to the ``pyarrow`` engine and to the pyarrow schema
    taken from ``df.schema.pa_schema``. Any entry in ``kwargs`` overrides the
    default with the same name and is forwarded to ``DataFrame.to_parquet``.

    :param df: dataframe to save
    :param p: parsed file location; only ``p.uri`` is read here
    :param kwargs: extra options for ``pandas.DataFrame.to_parquet``
    """
    pdf = df.as_pandas()
    # Defaults first, so caller supplied options win on key collisions.
    options = {"engine": "pyarrow", "schema": df.schema.pa_schema}
    options.update(kwargs)
    pdf.to_parquet(p.uri, **options)
示例#2
0
def _save_avro(df: LocalDataFrame, p: FileParser, **kwargs: Any):
    """Write ``df`` as an avro file at ``p.uri`` using pandavro.

    ``schema``, ``append`` and ``times_as_micros`` are taken out of
    ``kwargs`` (falling back to the defaults below) and passed to
    ``pandavro.to_avro`` explicitly; every remaining entry is forwarded
    unchanged.

    :param schema: Avro Schema determines dtypes saved (default ``None``)
    :param append: default is overwrite (``False``) instead of append
    :param times_as_micros: default ``True``
    """
    import pandavro as pdx

    kw = ParamDict(kwargs)

    # pandavro defaults, in the order they are looked up
    named = {"schema": None, "append": False, "times_as_micros": True}
    for key in named:
        if key in kw:
            named[key] = kw[key]
            # remove it so it is not passed a second time through **kw
            del kw[key]

    pdf = df.as_pandas()
    pdx.to_avro(p.uri, pdf, **named, **kw)
示例#3
0
 def transform(self, df: LocalDataFrame) -> LocalDataFrame:
     """Add a ``p`` and a ``ct`` column to the incoming partition.

     Before doing any work it asserts that ``on_init_called`` equals 1,
     that ``"test"`` is in ``workflow_conf`` and that ``"x"`` is in the
     metadata of ``df``.

     :param df: the partition to transform
     :return: a ``PandasDataFrame`` built with ``self.output_schema`` where
         ``p`` is ``self.params["p"]`` (1 if absent) and ``ct`` is the
         number of rows in the partition
     """
     assert self.on_init_called == 1
     assert "test" in self.workflow_conf
     assert "x" in df.metadata
     frame = df.as_pandas()
     p_value = self.params.get("p", 1)
     frame["p"] = p_value
     row_count = frame.shape[0]
     frame["ct"] = row_count
     return PandasDataFrame(frame, self.output_schema)
示例#4
0
 def run(self, cursor: PartitionCursor,
         df: LocalDataFrame) -> LocalDataFrame:
     """Run the transformer on one partition and discard its output.

     The cursor and metadata are attached first, then the transformer is
     invoked and its result is handed to ``to_local_bounded_df``
     (presumably to force evaluation -- confirm against that helper).
     Whether the call succeeds or raises one of ``self.ignore_errors``,
     an empty ``ArrayDataFrame`` with the transformer's output schema is
     returned. Any other exception propagates.

     :param cursor: cursor of the partition being processed
     :param df: data of the partition
     :return: an empty dataframe with ``self.transformer.output_schema``
     """
     transformer = self.transformer
     transformer._cursor = cursor  # type: ignore
     df._metadata = self.metadata
     try:
         produced = transformer.transform(df)
         to_local_bounded_df(produced)
         return ArrayDataFrame([], transformer.output_schema)
     except self.ignore_errors:  # type: ignore
         return ArrayDataFrame([], transformer.output_schema)
示例#5
0
def _save_avro(df: LocalDataFrame,
               p: FileParser,
               columns: Any = None,
               **kwargs: Any):
    """Save pandas dataframe as avro.
    If providing your own schema, the usage of schema argument is preferred

    The following keys are consumed from ``kwargs``; everything else is
    forwarded unchanged to ``pdx.to_avro``:

    :param schema: avro schema passed to ``pdx.to_avro``. When given as
        ``None`` and ``columns`` is not ``None``, the schema is built by
        ``_convert_pyarrow_to_avro_schema(df, columns)``
    :param infer_schema: only validated here (it must not be truthy when a
        schema is available); it is not forwarded to ``pdx.to_avro``
    :param append: default is overwrite (``False``) instead of append
    :param times_as_micros: default ``True``

    :param df: dataframe to save
    :param p: parsed file location; only ``p.uri`` is read here
    :param columns: used to derive the avro schema when ``schema`` is
        explicitly ``None``; must be empty when a schema is provided
    :raises Exception: if both ``schema`` and ``columns`` are provided, or
        if ``infer_schema`` is truthy while a schema is available
    """

    kw = ParamDict(kwargs)

    # pandavro defaults
    schema = None
    append = False
    times_as_micros = True

    # NOTE(review): ``columns`` is only consulted when the ``schema`` key is
    # present in kwargs (with value None). Confirm whether columns should
    # also be honoured when ``schema`` is omitted entirely.
    if "schema" in kw:
        schema = kw["schema"]
        if schema is None:
            if columns is not None:
                schema = _convert_pyarrow_to_avro_schema(df, columns)
        elif columns:
            # both schema and columns provided
            raise Exception("set columns to None when schema is provided")

        # Drop the consumed key. Deleting any other key here would either
        # raise KeyError or leave ``schema`` in ``kw`` and pass it to
        # ``pdx.to_avro`` twice.
        del kw["schema"]

    if "infer_schema" in kw:
        infer_schema = kw["infer_schema"]
        if infer_schema and (schema is not None):
            # infer_schema set to True but schema was provided
            raise Exception(
                "set infer_schema to False when schema is provided")
        del kw["infer_schema"]

    if "append" in kw:
        append = kw[
            "append"]  # default is overwrite (False) instead of append (True)
        del kw["append"]

    if "times_as_micros" in kw:
        times_as_micros = kw["times_as_micros"]
        del kw["times_as_micros"]

    pdf = df.as_pandas()
    pdx.to_avro(p.uri,
                pdf,
                schema=schema,
                append=append,
                times_as_micros=times_as_micros,
                **kw)
示例#6
0
 def run(self, cursor: PartitionCursor, df: LocalDataFrame) -> LocalDataFrame:
     """Run the transformer on one partition and return its output.

     The cursor and metadata are attached first. When ``ignore_errors`` is
     empty the transformer's result is returned as is. Otherwise the result
     is passed through ``to_local_bounded_df`` inside a ``try``; if one of
     the ``ignore_errors`` exception types is raised, an empty
     ``ArrayDataFrame`` with the transformer's output schema is returned
     instead.

     :param cursor: cursor of the partition being processed
     :param df: data of the partition
     :return: the transformed partition, or an empty dataframe when an
         ignorable error occurred
     """
     self.transformer._cursor = cursor  # type: ignore
     df._metadata = self.metadata
     nothing_to_ignore = len(self.ignore_errors) == 0
     if nothing_to_ignore:
         return self.transformer.transform(df)
     try:
         produced = self.transformer.transform(df)
         return to_local_bounded_df(produced)
     except self.ignore_errors:  # type: ignore  # pylint: disable=E0712
         return ArrayDataFrame([], self.transformer.output_schema)
示例#7
0
def _save_json(df: LocalDataFrame, p: FileParser, **kwargs: Any) -> None:
    """Write ``df`` as JSON at ``p.uri`` using pandas.

    Defaults to ``orient="records"`` with ``lines=True``. Any entry in
    ``kwargs`` overrides the default with the same name and is forwarded
    to ``DataFrame.to_json``.

    :param df: dataframe to save
    :param p: parsed file location; only ``p.uri`` is read here
    :param kwargs: extra options for ``pandas.DataFrame.to_json``
    """
    pdf = df.as_pandas()
    # Defaults first, so caller supplied options win on key collisions.
    options = {"orient": "records", "lines": True}
    options.update(kwargs)
    pdf.to_json(p.uri, **options)
示例#8
0
def _save_csv(df: LocalDataFrame, p: FileParser, **kwargs: Any) -> None:
    """Write ``df`` as CSV at ``p.uri`` using pandas.

    By default neither the index nor the header row is written. Any entry
    in ``kwargs`` overrides the default with the same name and is forwarded
    to ``DataFrame.to_csv``.

    :param df: dataframe to save
    :param p: parsed file location; only ``p.uri`` is read here
    :param kwargs: extra options for ``pandas.DataFrame.to_csv``
    """
    pdf = df.as_pandas()
    # Defaults first, so caller supplied options win on key collisions.
    options = {"index": False, "header": False}
    options.update(kwargs)
    pdf.to_csv(p.uri, **options)
示例#9
0
def f26(e: pd.DataFrame, a: LocalDataFrame) -> Iterable[Dict[str, Any]]:
    """Concatenate the rows of ``e`` and ``a`` and yield them as dicts.

    :param e: pandas dataframe, wrapped in ``PandasDataFrame`` to read its
        rows as arrays
    :param a: dataframe whose rows are appended after those of ``e``
    :return: the dict iterable of an ``ArrayDataFrame`` built from the
        combined rows with schema ``a:int``
    """
    rows = list(PandasDataFrame(e).as_array())
    rows.extend(list(a.as_array()))
    combined = ArrayDataFrame(rows, "a:int")
    return combined.as_dict_iterable()
示例#10
0
def f25(e: DataFrame, a: LocalDataFrame) -> List[Dict[str, Any]]:
    """Concatenate the rows of ``e`` and ``a`` and return them as dicts.

    :param e: dataframe providing the first rows
    :param a: dataframe whose rows are appended after those of ``e``
    :return: list of dicts read from an ``ArrayDataFrame`` built from the
        combined rows with schema ``a:int``
    """
    combined = e.as_array()
    # In-place extension of whatever ``e.as_array()`` returned; kept as
    # ``+=`` on purpose so the object being extended stays the same.
    combined += list(a.as_array())
    merged = ArrayDataFrame(combined, "a:int")
    return list(merged.as_dict_iterable())
示例#11
0
 def count(self, df: LocalDataFrame) -> int:
     """Return the number of rows in ``df``.

     A bounded dataframe is asked for its count directly; otherwise the
     rows are iterated through ``as_array_iterable`` and tallied one by one.

     :param df: dataframe to count
     :return: number of rows
     """
     if not df.is_bounded:
         total = 0
         for _ in df.as_array_iterable():
             total += 1
         return total
     return df.count()
示例#12
0
def _save_json(df: LocalDataFrame, p: FileParser, **kwargs: Any) -> None:
    """Write ``df`` as JSON at ``p.uri`` using pandas.

    No defaults are applied here: ``kwargs`` is forwarded unchanged to
    ``DataFrame.to_json``.

    :param df: dataframe to save
    :param p: parsed file location; only ``p.uri`` is read here
    :param kwargs: options for ``pandas.DataFrame.to_json``
    """
    pdf = df.as_pandas()
    target = p.uri
    pdf.to_json(target, **kwargs)