def test_from_pandas_datetime() -> None:
    """Converting a pandas datetime64 column must not raise.

    Fixes: added the ``-> None`` return annotation for consistency with the
    sibling tests, and added a shape assertion — the original had no assert,
    so it only ever checked "does not raise".
    """
    df = pd.DataFrame({"datetime": ["2021-01-01", "2021-01-02"], "foo": [1, 2]})
    # pandas parses the strings into a datetime64[ns] column
    df["datetime"] = pd.to_datetime(df["datetime"])
    out = pl.from_pandas(df)
    assert out.shape == (2, 2)
def test_from_pandas_dataframe() -> None:
    """Round-trip a small pandas frame; non-frame input raises ValueError."""
    source = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "b", "c"])
    converted = pl.from_pandas(source)
    assert converted.shape == (2, 3)

    # anything that is not a pandas object must be rejected
    with pytest.raises(ValueError):
        _ = pl.from_pandas([1, 2])  # type: ignore
def test_from_pandas_nan_to_none() -> None:
    """NaN maps to null by default; ``nan_to_none=False`` keeps NaN.

    Fixes: the original wrote ``assert [val is None for val in out_true]`` —
    asserting on a non-empty list, which is always truthy, so nothing was
    actually checked (and element 0 is ``2``, not None). Use ``all(...)``
    over the null tail so every element is verified.
    """
    from pyarrow import ArrowInvalid

    df = pd.Series([2, np.nan, None], name="pd")
    out_true = pl.from_pandas(df)
    out_false = pl.from_pandas(df, nan_to_none=False)
    df.loc[2] = pd.NA
    # indices 1 and 2 hold the missing values; index 0 is the real value 2
    assert all(val is None for val in out_true[1:])
    assert all(np.isnan(val) for val in out_false[1:])
    # pd.NA cannot be converted when NaN passthrough is requested
    with pytest.raises(ArrowInvalid, match="Could not convert"):
        pl.from_pandas(df, nan_to_none=False)
def test_from_optional_not_available() -> None:
    """Each ``from_*`` helper raises ImportError when its dependency is absent.

    Availability flags in ``polars.convert`` are patched to False to simulate
    the optional package not being installed.
    """
    with patch("polars.convert._NUMPY_AVAILABLE", False), pytest.raises(ImportError):
        pl.from_numpy(np.array([[1, 2], [3, 4]]), columns=["a", "b"])

    with patch("polars.convert._PYARROW_AVAILABLE", False):
        with pytest.raises(ImportError):
            pl.from_arrow(pa.table({"a": [1, 2, 3], "b": [4, 5, 6]}))
        # from_pandas also requires pyarrow under the hood
        with pytest.raises(ImportError):
            pl.from_pandas(pd.Series([1, 2, 3]))

    with patch("polars.convert._PANDAS_AVAILABLE", False), pytest.raises(ImportError):
        pl.from_pandas(pd.Series([1, 2, 3]))
def test_from_pandas_nan_to_none() -> None:
    """NaN/None handling for both a DataFrame and a Series input.

    Fixes: in the Series half the original wrote
    ``assert [val is None for val in out_true]`` — asserting on a non-empty
    list is always truthy, so the checks were vacuous (and element 0 is ``2``,
    not None). Use ``all(...)`` over the null tail, mirroring the DataFrame
    half which already did so.
    """
    from pyarrow import ArrowInvalid

    df = pd.DataFrame(
        {
            "bools_nulls": [None, True, False],
            "int_nulls": [1, None, 3],
            "floats_nulls": [1.0, None, 3.0],
            "strings_nulls": ["foo", None, "ham"],
            "nulls": [None, np.nan, np.nan],
        }
    )
    out_true = pl.from_pandas(df)
    out_false = pl.from_pandas(df, nan_to_none=False)
    df.loc[2, "nulls"] = pd.NA
    assert all(val is None for val in out_true["nulls"])
    assert all(np.isnan(val) for val in out_false["nulls"][1:])
    with pytest.raises(ArrowInvalid, match="Could not convert"):
        pl.from_pandas(df, nan_to_none=False)

    df = pd.Series([2, np.nan, None], name="pd")  # type: ignore
    out_true = pl.from_pandas(df)
    out_false = pl.from_pandas(df, nan_to_none=False)
    df.loc[2] = pd.NA
    # indices 1 and 2 hold the missing values; index 0 is the real value 2
    assert all(val is None for val in out_true[1:])
    assert all(np.isnan(val) for val in out_false[1:])
    with pytest.raises(ArrowInvalid, match="Could not convert"):
        pl.from_pandas(df, nan_to_none=False)
def test_from_pandas_nested_list() -> None:
    """A column of Python lists converts without panicking.

    Regression test for https://github.com/pola-rs/polars/issues/1615.
    """
    nested = pd.DataFrame(
        {"a": [1, 2, 3, 4], "b": [["x", "y"], ["x", "y", "z"], ["x"], ["x", "y"]]}
    )
    assert pl.from_pandas(nested).shape == (4, 2)
def test_from_pandas_datetime():
    """Hour, minute and second of a pandas timestamp survive conversion."""
    ts = datetime.datetime(2021, 1, 1, 20, 20, 20, 20)
    converted = pl.from_pandas(pd.Series([ts, ts]).to_frame("a"))["a"]
    # NOTE: uses the direct Series accessors of this polars version
    for component in (converted.hour, converted.minute, converted.second):
        assert component()[0] == 20
def build_gene_annotation_df(pset_dict):
    """
    Build a table mapping each gene in a dataset to its gene annotations.

    @param pset_dict: [`dict`] A nested dictionary containing all tables in the PSet
    @return: [`DataFrame`] A table of all gene annotations, mapped to genes
    """
    # One row-annotation frame per molecular data type present in the PSet
    annotation_frames = [
        pl.from_pandas(pset_dict['molecularProfiles'][data_type]['rowData'])
        for data_type in pset_dict['molecularProfiles']
    ]

    # Keep only the feature ids, then pre-allocate the annotation columns
    # (filled downstream) as typed all-null series
    for idx, frame in enumerate(annotation_frames):
        frame = frame.select(['.features'])
        placeholder = [None] * len(frame['.features'])
        frame['symbol'] = pl.Series('symbol', placeholder, dtype=pl.Utf8)
        frame['gene_seq_start'] = pl.Series('gene_seq_start', placeholder, dtype=pl.Int64)
        frame['gene_seq_end'] = pl.Series('gene_seq_end', placeholder, dtype=pl.Int64)
        annotation_frames[idx] = frame

    # Stack every data type into a single table
    gene_annotation_df = pl.concat(annotation_frames).rename({'.features': 'gene_id'})

    # Strip the Ensembl gene version suffix (everything from the first '.')
    gene_annotation_df['gene_id'] = gene_annotation_df['gene_id'].apply(
        lambda gene: re.sub(r'\..*$', '', gene))

    return gene_annotation_df.drop_duplicates().to_pandas()
def test_struct_to_pandas() -> None:
    """Nested dicts become Struct dtype and round-trip back to pandas."""
    original = pd.DataFrame([{"a": {"b": {"c": 2}}}])
    converted = pl.from_pandas(original)
    assert isinstance(converted.dtypes[0], pl.datatypes.Struct)
    assert converted.to_pandas().equals(original)
def test_from_empty_pandas() -> None:
    """Empty frames keep their column names; empty columns become Float64."""
    empty = pd.DataFrame({"A": [], "fruits": []})
    converted = pl.from_pandas(empty)
    assert converted.columns == ["A", "fruits"]
    assert converted.dtypes == [pl.Float64, pl.Float64]
def test_from_pandas_ns_resolution() -> None:
    """A nanosecond-precision timestamp converts, dropping the sub-microsecond part."""
    ts = pd.Timestamp(year=2021, month=1, day=1, hour=1, second=1, nanosecond=1)
    frame = pd.DataFrame([ts], columns=["date"])
    assert pl.from_pandas(frame)[0, 0] == datetime(2021, 1, 1, 1, 0, 1)
def test_from_pandas_datetime() -> None:
    """Datetime components, DatetimeIndex ranges and datetime64 frames convert."""
    ts = datetime(2021, 1, 1, 20, 20, 20, 20)
    converted = pl.from_pandas(pd.Series([ts, ts]).to_frame("a"))["a"]
    assert converted.dt.hour()[0] == 20
    assert converted.dt.minute()[0] == 20
    assert converted.dt.second()[0] == 20

    date_times = pd.date_range(
        "2021-06-24 00:00:00", "2021-06-24 10:00:00", freq="1H", closed="left"
    )
    range_series = pl.from_pandas(date_times)
    assert range_series[0] == datetime(2021, 6, 24, 0, 0)
    assert range_series[-1] == datetime(2021, 6, 24, 9, 0)

    # a datetime64 column inside a frame must also convert without raising
    frame = pd.DataFrame({"datetime": ["2021-01-01", "2021-01-02"], "foo": [1, 2]})
    frame["datetime"] = pd.to_datetime(frame["datetime"])
    pl.from_pandas(frame)
def test_cast_inner() -> None:
    """Casting list inner types, including a column whose inner type is null."""
    series = pl.Series([[1, 2]])
    for target in (bool, pl.Boolean):
        casted = series.cast(pl.List(target))
        assert casted.dtype == pl.List(pl.Boolean)
        assert casted.to_list() == [[True, True]]

    # an all-empty nested column produces an inner null type
    df = pl.from_pandas(pd.DataFrame(data=[[[]], [[]]], columns=["A"]))
    assert df["A"].cast(pl.List(int)).dtype.inner == pl.Int64  # type: ignore[arg-type, attr-defined]
def test_from_pandas_datetime():
    """Datetime components, epoch values, and the dt round dispatch paths."""
    ts = datetime.datetime(2021, 1, 1, 20, 20, 20, 20)
    converted = pl.from_pandas(pd.Series([ts, ts]).to_frame("a"))["a"]
    assert converted.dt.hour()[0] == 20
    assert converted.dt.minute()[0] == 20
    assert converted.dt.second()[0] == 20

    date_times = pd.date_range(
        "2021-06-24 00:00:00", "2021-06-24 10:00:00", freq="1H", closed="left"
    )
    range_series = pl.from_pandas(date_times)
    # this polars version exposes the values as epoch milliseconds
    assert range_series[0] == 1624492800000
    assert range_series[-1] == 1624525200000

    # checks dispatch
    range_series.dt.round("hour", 2)
    range_series.dt.round("day", 5)
    # checks lazy dispatch
    pl.DataFrame([range_series.rename("foo")])[pl.col("foo").dt.round("hour", 2)]
def test_join_dates():
    """Self-joining a frame on a datetime key must not raise."""
    date_times = pd.date_range(
        "2021-06-24 00:00:00", "2021-06-24 10:00:00", freq="1H", closed="left"
    )

    def _jitter(ts):
        # shift each timestamp by a random 1–60 minute offset (in ms)
        return ts + np.random.randint(1_000 * 60, 60_000 * 60)

    dts = pl.from_pandas(date_times).apply(_jitter).cast(pl.Date64)

    # some df with sensor id, (randomish) datetime and some value
    df = pl.DataFrame(
        {
            "sensor": ["a"] * 5 + ["b"] * 5,
            "datetime": dts,
            "value": [2, 3, 4, 1, 2, 3, 5, 1, 2, 3],
        }
    )
    df.join(df, on="datetime")
def test_from_pandas():
    """All basic dtypes, with and without nulls, convert to a (3, 9) frame."""
    source = pd.DataFrame(
        {
            "bools": [False, True, False],
            "bools_nulls": [None, True, False],
            "int": [1, 2, 3],
            "int_nulls": [1, None, 3],
            "floats": [1.0, 2.0, 3.0],
            "floats_nulls": [1.0, None, 3.0],
            "strings": ["foo", "bar", "ham"],
            "strings_nulls": ["foo", None, "ham"],
            "strings-cat": ["foo", "bar", "ham"],
        }
    )
    source["strings-cat"] = source["strings-cat"].astype("category")
    assert pl.from_pandas(source).shape == (3, 9)
def test_struct_logical_types_to_pandas() -> None:
    """A struct holding a logical (datetime) field converts to Struct dtype."""
    timestamp = datetime(2022, 1, 1)
    frame = pd.DataFrame([{"struct": {"timestamp": timestamp}}])
    assert pl.from_pandas(frame).dtypes == [pl.Struct]
def test_from_null_column() -> None:
    """An all-null pandas column still yields a (2, 1) polars frame."""
    all_null = pd.DataFrame(data=[pd.NA, pd.NA])
    assert pl.from_pandas(all_null).shape == (2, 1)
def test_from_empty_pandas_strings() -> None:
    """Explicit pandas dtypes on empty columns are preserved by conversion."""
    empty = pd.DataFrame(columns=["a", "b"])
    empty["a"] = empty["a"].astype(str)
    empty["b"] = empty["b"].astype(float)
    assert pl.from_pandas(empty).dtypes == [pl.Utf8, pl.Float64]
def test_from_pandas_series() -> None:
    """A named pandas Series converts to a length-3 polars Series."""
    source = pd.Series([1, 2, 3], name="pd")
    assert pl.from_pandas(source).shape == (3,)
def test_from_pandas_categorical_none() -> None:
    """A categorical with a missing value maps to Categorical dtype plus None."""
    source = pd.Series(["a", "b", "c", pd.NA], dtype="category")
    converted = pl.from_pandas(source)
    assert converted.dtype == pl.Categorical
    assert converted.to_list() == ["a", "b", "c", None]