def _apply_schema(
    self, pdf: pd.DataFrame, schema: Optional[Schema]
) -> Tuple[pd.DataFrame, Schema]:
    """Reconcile ``pdf`` with ``schema`` and return the typed frame and schema.

    Two cases are distinguished by the dtype of the column index:

    * ``object`` dtype -- the frame is treated as having named columns. A
      schema is derived from the frame with ``_input_schema``. If ``schema``
      is ``None`` or equal to the derived one, the frame is returned as is
      together with the derived schema. Otherwise the frame is narrowed and
      reordered to ``schema.names``.
    * any other dtype -- the frame is treated as unnamed. ``schema`` is then
      required, its length must equal the number of columns, and the columns
      are renamed positionally to ``schema.names``.

    Unless the early return is taken, the frame is finally passed through
    ``PD_UTILS.enforce_type`` with ``schema.pa_schema``.

    :param pdf: the pandas DataFrame to validate; it is never modified in
        place by this method
    :param schema: the expected schema, or ``None`` to derive it from ``pdf``
    :return: a tuple of the (possibly converted) DataFrame and its schema
    :raises ValueError: handed to ``assert_or_throw`` when an unnamed frame's
        column count differs from the schema length
    """
    PD_UTILS.ensure_compatible(pdf)
    if pdf.columns.dtype == "object":  # pdf has named schema
        pschema = _input_schema(pdf)
        if schema is None or pschema == schema:
            return pdf, pschema.assert_not_empty()
        pdf = pdf[schema.assert_not_empty().names]
    else:  # pdf has no named schema
        schema = _input_schema(schema).assert_not_empty()
        assert_or_throw(
            pdf.shape[1] == len(schema),
            ValueError(f"Pandas datafame column count doesn't match {schema}"),
        )
        # Rename on a shallow copy: assigning ``columns`` directly on ``pdf``
        # would silently rename the caller's DataFrame. A shallow copy shares
        # the underlying data, so no values are duplicated.
        pdf = pdf.copy(deep=False)
        pdf.columns = schema.names
    return PD_UTILS.enforce_type(pdf, schema.pa_schema, null_safe=True), schema
def __init__(  # noqa: C901
    self,
    df: Any = None,
    schema: Any = None,
    metadata: Any = None,
    pandas_df_wrapper: bool = False,
):
    """Build the dataframe from one of several supported inputs.

    Supported values of ``df``:

    * ``None`` -- treated as an empty list of rows; ``schema`` must resolve
      to a non-empty schema.
    * ``PandasDataFrame`` -- its ``native`` frame is reused and the
      ``schema`` argument is discarded.
    * ``pd.DataFrame`` / ``pd.Series`` -- a Series is first converted with
      ``to_frame()``. When ``pandas_df_wrapper`` is set and a schema is
      given, the frame is wrapped without calling ``_apply_schema``.
    * any other ``Iterable`` -- rows are loaded into a new DataFrame whose
      columns come from the schema, then passed through
      ``PD_UTILS.enforce_type``; ``_apply_schema`` is not called.

    :param df: the input data, see the list above
    :param schema: anything accepted by ``_input_schema``; may be ``None``
        only for pandas or ``PandasDataFrame`` input
    :param metadata: forwarded unchanged to the parent initializer
    :param pandas_df_wrapper: wrap a pandas object as is when a schema is
        also provided
    :raises FugueDataFrameInitError: for any failure during construction;
        the original exception is attached as the cause
    """
    try:
        if df is None:
            schema = _input_schema(schema).assert_not_empty()
            df = []

        needs_schema_check = True
        if isinstance(df, PandasDataFrame):
            # TODO: the caller-supplied schema is thrown away here; this
            # branch was flagged by its author as useless and wrong.
            native = df.native
            schema = None
        elif isinstance(df, (pd.DataFrame, pd.Series)):
            native = df.to_frame() if isinstance(df, pd.Series) else df
            if schema is not None:
                schema = _input_schema(schema)
            if pandas_df_wrapper and schema is not None:
                # Pure wrapper mode: take the frame and the schema on trust.
                needs_schema_check = False
        elif isinstance(df, Iterable):
            schema = _input_schema(schema).assert_not_empty()
            raw = pd.DataFrame(df, columns=schema.names)
            native = PD_UTILS.enforce_type(raw, schema.pa_schema, null_safe=True)
            if PD_UTILS.empty(native):
                # With no rows, each column is cast explicitly to the pandas
                # dtype of its schema field.
                for name, field in schema.items():
                    native[name] = native[name].astype(
                        field.type.to_pandas_dtype()
                    )
            needs_schema_check = False
        else:
            raise ValueError(f"{df} is incompatible with PandasDataFrame")

        if needs_schema_check:
            native, schema = self._apply_schema(native, schema)
        super().__init__(schema, metadata)
        self._native = native
    except Exception as e:
        raise FugueDataFrameInitError from e
def as_pandas(self) -> pd.DataFrame:
    """Convert to pandas DataFrame

    The rows returned by ``as_array()`` are loaded into a DataFrame whose
    columns are ``self.schema.names``; the result is then passed through
    ``PD_UTILS.enforce_type`` with ``self.schema.pa_schema`` and
    ``null_safe=True``.

    :return: the pandas DataFrame produced by ``PD_UTILS.enforce_type``
    """
    rows = self.as_array()
    frame = pd.DataFrame(rows, columns=self.schema.names)
    typed = PD_UTILS.enforce_type(frame, self.schema.pa_schema, null_safe=True)
    return typed
def __init__(self, data, schema, enforce=False):
    """Build a typed pandas frame from ``data`` and a schema expression.

    ``schema`` is resolved with ``expression_to_schema``; its ``names``
    become the DataFrame columns. The frame is then passed through
    ``PD_UTILS.enforce_type`` together with the resolved schema and the
    ``enforce`` flag.

    :param data: row data accepted by ``pd.DataFrame``
    :param schema: schema expression understood by ``expression_to_schema``
    :param enforce: third positional argument forwarded to
        ``PD_UTILS.enforce_type``
    """
    resolved = expression_to_schema(schema)
    frame = pd.DataFrame(data, columns=resolved.names)
    # ``native`` is assigned before ``schema`` so that a failure in
    # ``enforce_type`` leaves neither attribute set.
    self.native = PD_UTILS.enforce_type(frame, resolved, enforce)
    self.schema = resolved