def test_arrow_all_null_text_column(self):
    # Convert a single-column Arrow text table with arrow_table_to_dataframe()
    # and check both return values: the DataFrame (the Arrow null is expected
    # to surface as NaN) and the column metadata (expected to be Text).
    # NOTE(review): despite the "all_null" name, the fixture holds one null
    # among three strings -- confirm which case this is meant to cover.
    source = arrow_table(
        {"A": pyarrow.array(["a", "b", None, "c"])},
        columns=[atypes.Column("A", ColumnType.Text())],
    )
    dataframe, columns = arrow_table_to_dataframe(source)

    expected_frame = pd.DataFrame({"A": ["a", "b", np.nan, "c"]})
    assert_frame_equal(dataframe, expected_frame)
    self.assertEqual(columns, [Column("A", ColumnType.Text())])
def test_coerce_infer_columns_try_fallback_columns_ignore_wrong_type(self):
    # The fallback columns deliberately disagree with the data: Text for the
    # integer column "A" and Number for the string column "B". The assertion
    # expects the result to report Number for "A" and Text for "B" instead.
    table = pd.DataFrame({"A": [1, 2], "B": ["x", "y"]})
    mismatched_fallbacks = [
        Column("A", ColumnType.Text()),
        Column("B", ColumnType.Number()),
    ]

    result = ProcessResult.coerce(table, try_fallback_columns=mismatched_fallbacks)

    actual_columns = result.columns
    expected_columns = [
        Column("A", ColumnType.Number()),
        Column("B", ColumnType.Text()),
    ]
    self.assertEqual(actual_columns, expected_columns)
def test_coerce_infer_columns(self):
    # Coerce a bare DataFrame (no column metadata supplied) and check that
    # the result reports Number for the integer column and Text for the
    # string column.
    frame = pd.DataFrame({"A": [1, 2], "B": ["x", "y"]})

    coerced = ProcessResult.coerce(frame)

    actual_columns = coerced.columns
    expected_columns = [
        Column("A", ColumnType.Number()),
        Column("B", ColumnType.Text()),
    ]
    self.assertEqual(actual_columns, expected_columns)
def test_dataframe_all_null_text_column(self):
    # Pass a one-row, all-None str-dtype DataFrame to
    # dataframe_to_arrow_table() (with self.path as its third argument) and
    # compare the result to an Arrow table holding a single null of type
    # string.
    # NOTE(review): another test with this exact name appears later in this
    # listing; if both live in the same class, the later one shadows this.
    all_null_frame = pd.DataFrame({"A": [None]}, dtype=str)
    text_columns = [Column("A", ColumnType.Text())]

    actual = dataframe_to_arrow_table(all_null_frame, text_columns, self.path)
    expected = arrow_table({"A": pyarrow.array([None], pyarrow.string())})

    assert_arrow_table_equals(actual, expected)
def test_columns(self):
    # Build a one-row frame with a number, a string, a datetime and a
    # categorical column, wrap it in ProcessResult, then check the reported
    # column names and column types. The categorical column is expected to
    # be reported as Text.
    cells = {
        "A": [1],  # number
        "B": ["foo"],  # str
        "C": dt(2018, 8, 20),  # datetime64
    }
    df = pd.DataFrame(cells)
    df["D"] = pd.Series(["cat"], dtype="category")

    result = ProcessResult(df)

    self.assertEqual(result.column_names, ["A", "B", "C", "D"])
    actual_columns = result.columns
    expected_columns = [
        Column("A", ColumnType.Number()),
        Column("B", ColumnType.Text()),
        Column("C", ColumnType.Timestamp()),
        Column("D", ColumnType.Text()),
    ]
    self.assertEqual(actual_columns, expected_columns)
def test_coerce_infer_columns_with_format(self):
    # Coerce a dict carrying a "dataframe" plus "column_formats"; the format
    # string given for "A" is expected to appear on its Number column type,
    # while "B" is expected to be plain Text.
    table = pd.DataFrame({"A": [1, 2], "B": ["x", "y"]})
    payload = {"dataframe": table, "column_formats": {"A": "{:,d}"}}

    result = ProcessResult.coerce(payload)

    actual_columns = result.columns
    expected_columns = [
        Column("A", ColumnType.Number(format="{:,d}")),
        Column("B", ColumnType.Text()),
    ]
    self.assertEqual(actual_columns, expected_columns)
def test_dataframe_all_null_category_column(self):
    # Hand an all-None categorical column to the shared
    # _test_dataframe_to_arrow_table() helper together with the Arrow table
    # it is compared against: a dictionary array with one null int8 index
    # and an empty string dictionary.
    # NOTE(review): another test with this exact name appears later in this
    # listing; if both live in the same class, the later one shadows this.
    all_null_categorical = pd.DataFrame({"A": [None]}, dtype=str).astype("category")
    text_columns = [Column("A", ColumnType.Text())]

    null_indices = pa.array([None], pa.int8())
    empty_dictionary = pa.array([], pa.string())
    expected = pa.table(
        {"A": pa.DictionaryArray.from_arrays(null_indices, empty_dictionary)}
    )

    self._test_dataframe_to_arrow_table(all_null_categorical, text_columns, expected)
def test_dataframe_category_column(self):
    # Hand a categorical column (two categories plus one missing value) to
    # the shared _test_dataframe_to_arrow_table() helper together with the
    # Arrow table it is compared against: a dictionary array with int8
    # indices into the string dictionary ["A", "B"].
    # NOTE(review): another test with this exact name appears later in this
    # listing; if both live in the same class, the later one shadows this.
    categorical_frame = pd.DataFrame({"A": ["A", "B", None, "A"]}, dtype="category")
    text_columns = [Column("A", ColumnType.Text())]

    indices = pa.array([0, 1, None, 0], pa.int8())
    dictionary = pa.array(["A", "B"], pa.string())
    expected = pa.table({"A": pa.DictionaryArray.from_arrays(indices, dictionary)})

    self._test_dataframe_to_arrow_table(categorical_frame, text_columns, expected)
def test_arrow_category_column(self):
    # Convert an Arrow dictionary-encoded column (int8 indices, string
    # dictionary, one null index) with arrow_table_to_dataframe(). The
    # column metadata is expected to be Text and the frame is expected to
    # hold a pandas "category" column with the decoded values.
    indices = pyarrow.array([0, 1, None, 0], type=pyarrow.int8())
    dictionary = pyarrow.array(["A", "B"], type=pyarrow.string())
    atable = arrow_table(
        {"A": pyarrow.DictionaryArray.from_arrays(indices, dictionary)}
    )

    dataframe, columns = arrow_table_to_dataframe(atable)

    self.assertEqual(columns, [Column("A", ColumnType.Text())])
    expected_frame = pd.DataFrame({"A": ["A", "B", None, "A"]}, dtype="category")
    assert_frame_equal(dataframe, expected_frame)
def test_coerce_infer_columns_with_unit(self):
    # Coerce a dict whose "column_formats" entry for the Period column "A"
    # is "year"; the assertion expects that value to come back as the unit
    # of a Date column type, with "B" reported as Text.
    periods = [pd.Period("2021-01-01", freq="D"), None]
    table = pd.DataFrame({"A": periods, "B": ["x", "y"]})
    payload = {"dataframe": table, "column_formats": {"A": "year"}}

    result = ProcessResult.coerce(payload)

    actual_columns = result.columns
    expected_columns = [
        Column("A", ColumnType.Date(unit="year")),
        Column("B", ColumnType.Text()),
    ]
    self.assertEqual(actual_columns, expected_columns)
def test_ctor_infer_columns(self):
    # Construct ProcessResult directly from a DataFrame with an integer, a
    # string and a NaN-plus-datetime column, and check the reported column
    # types are Number, Text and Timestamp respectively.
    # NOTE(review): another test with this exact name appears later in this
    # listing; if both live in the same class, the later one shadows this.
    frame = pd.DataFrame(
        {
            "A": [1, 2],
            "B": ["x", "y"],
            "C": [np.nan, dt(2019, 3, 3, 4, 5, 6, 7)],
        }
    )

    result = ProcessResult(frame)

    actual_columns = result.columns
    expected_columns = [
        Column("A", ColumnType.Number()),
        Column("B", ColumnType.Text()),
        Column("C", ColumnType.Timestamp()),
    ]
    self.assertEqual(actual_columns, expected_columns)
def test_arrow_all_null_category_column(self):
    # Convert an Arrow dictionary-encoded column with a single null index
    # and an empty string dictionary. The column metadata is expected to be
    # Text and the frame is expected to equal an all-None str frame cast to
    # "category".
    null_indices = pyarrow.array([None], type=pyarrow.int8())
    empty_dictionary = pyarrow.array([], type=pyarrow.string())
    atable = arrow_table(
        {"A": pyarrow.DictionaryArray.from_arrays(null_indices, empty_dictionary)}
    )

    dataframe, columns = arrow_table_to_dataframe(atable)

    self.assertEqual(columns, [Column("A", ColumnType.Text())])
    expected_frame = pd.DataFrame({"A": [None]}, dtype=str).astype("category")
    assert_frame_equal(dataframe, expected_frame)
def test_dataframe_category_column(self):
    # Pass a categorical DataFrame column (two categories plus one missing
    # value) to dataframe_to_arrow_table(), with self.path as its third
    # argument, and compare the result to an Arrow table whose column is a
    # dictionary array with int8 indices into ["A", "B"].
    categorical_frame = pd.DataFrame({"A": ["A", "B", None, "A"]}, dtype="category")
    text_columns = [Column("A", ColumnType.Text())]
    actual = dataframe_to_arrow_table(categorical_frame, text_columns, self.path)

    indices = pyarrow.array([0, 1, None, 0], type=pyarrow.int8())
    dictionary = pyarrow.array(["A", "B"], type=pyarrow.string())
    expected = arrow_table(
        {"A": pyarrow.DictionaryArray.from_arrays(indices, dictionary)}
    )

    assert_arrow_table_equals(actual, expected)
def test_dataframe_all_null_category_column(self):
    # Pass an all-None categorical column to dataframe_to_arrow_table(),
    # with self.path as its third argument, and compare the result to an
    # Arrow table whose column is a dictionary array with one null int8
    # index and an empty string dictionary.
    all_null_categorical = pd.DataFrame({"A": [None]}, dtype=str).astype("category")
    text_columns = [Column("A", ColumnType.Text())]
    actual = dataframe_to_arrow_table(all_null_categorical, text_columns, self.path)

    null_indices = pyarrow.array([None], type=pyarrow.int8())
    empty_dictionary = pyarrow.array([], type=pyarrow.string())
    expected = arrow_table(
        {"A": pyarrow.DictionaryArray.from_arrays(null_indices, empty_dictionary)}
    )

    assert_arrow_table_equals(actual, expected)
def test_ctor_infer_columns(self):
    # Construct ProcessResult directly from a DataFrame with an integer, a
    # string, a NaN-plus-datetime and a Period-plus-NaT column, and check
    # the reported column types are Number, Text, Timestamp and Date("day").
    frame = pd.DataFrame(
        {
            "A": [1, 2],
            "B": ["x", "y"],
            "C": [np.nan, dt(2019, 3, 3, 4, 5, 6, 7)],
            "D": [pd.Period("2021-01-01", freq="D"), pd.NaT],
        }
    )

    result = ProcessResult(frame)

    actual_columns = result.columns
    expected_columns = [
        Column("A", ColumnType.Number()),
        Column("B", ColumnType.Text()),
        Column("C", ColumnType.Timestamp()),
        Column("D", ColumnType.Date("day")),
    ]
    self.assertEqual(actual_columns, expected_columns)
def test_dataframe_all_null_text_column(self):
    # Hand a one-row, all-None str-dtype DataFrame to the shared
    # _test_dataframe_to_arrow_table() helper; the expected table is built
    # with make_table()/make_column() as a single null of Arrow type string.
    all_null_frame = pd.DataFrame({"A": [None]}, dtype=str)
    text_columns = [Column("A", ColumnType.Text())]
    expected = make_table(make_column("A", [None], pa.string()))

    self._test_dataframe_to_arrow_table(
        all_null_frame,
        text_columns,
        expected_table=expected,
    )