def _get_altered_schema(self, subschema: Any) -> Schema:
    """Validate requested column type changes and build the resulting schema.

    ``subschema`` is wrapped in ``Schema`` and treated as a partial schema
    whose columns should replace the same-named columns of ``self.schema``.

    Checks are performed through ``assert_or_throw``:

    * the requested column names must all be contained in ``self.schema``,
      otherwise a ``FugueDataFrameOperationError`` is supplied as the error;
    * for every column whose type actually differs, neither the current nor
      the requested type may be a struct, list or binary type, otherwise a
      ``NotImplementedError`` is supplied as the error.

    :param subschema: anything accepted by the ``Schema`` constructor
    :return: a ``Schema`` built by walking ``self.schema.items()`` in order
        and using the requested entry for a name when ``requested.get``
        finds one, else the existing entry
    """

    def _is_convertible(tp: Any) -> bool:
        # Struct, list and binary types are excluded from conversion.
        return not (
            pa.types.is_struct(tp)
            or pa.types.is_list(tp)
            or pa.types.is_binary(tp)
        )

    requested = Schema(subschema)
    assert_or_throw(
        requested.names in self.schema,
        lambda: FugueDataFrameOperationError(
            f"{requested.names} are not all in {self.schema}"
        ),
    )

    for name, field in requested.items():
        current, target = self.schema[name].type, field.type
        if current.equals(target):
            # Unchanged type: nothing to validate for this column.
            continue
        # The source type is checked before the destination type, so a
        # column failing both reports the "from" error.
        assert_or_throw(
            _is_convertible(current),
            lambda: NotImplementedError(f"can't convert from {current}"),
        )
        assert_or_throw(
            _is_convertible(target),
            lambda: NotImplementedError(f"can't convert to {target}"),
        )

    merged = [
        (name, requested.get(name, field))
        for name, field in self.schema.items()
    ]
    return Schema(merged)
def _enforce_type(df: pd.DataFrame, schema: Schema) -> pd.DataFrame:
    """Cast the columns of ``df`` listed in ``schema`` to the schema's types.

    For each ``(name, field)`` pair yielded by ``schema.items()`` the column
    ``df[name]`` is read, converted according to ``field.type`` and assigned
    back to ``df[name]``:

    * string types: cast with ``astype(str)``;
    * integer and boolean types: nulls are filled with ``0`` and the column
      is cast to ``field.type.to_pandas_dtype()``;
    * struct and list types: the column is left as it is;
    * any other type: cast to ``field.type.to_pandas_dtype()``.

    In the string and integer/boolean cases the positions that were null
    before the cast are recorded first and set to ``None`` afterwards.
    NOTE(review): writing ``None`` back presumably lets pandas pick a
    different dtype for columns that contained nulls -- confirm if the exact
    dtype matters to callers.

    :param df: the dataframe to convert; it is modified in place
    :param schema: the target schema
    :return: the same ``df`` object that was passed in
    """
    # TODO: does this have higher latency?
    for name, field in schema.items():
        column = df[name]
        tp = field.type
        as_text = pa.types.is_string(tp)
        if as_text or pa.types.is_integer(tp) or pa.types.is_boolean(tp):
            # Remember the null positions before casting so they can be
            # restored once the cast has been applied.
            null_mask = column.isnull()
            if as_text:
                column = column.astype(str)
            else:
                column = column.fillna(0).astype(tp.to_pandas_dtype())
            column[null_mask] = None
        elif not (pa.types.is_struct(tp) or pa.types.is_list(tp)):
            column = column.astype(tp.to_pandas_dtype())
        df[name] = column
    return df