Python CategoricalDtype.CategoricalDtype示例，pandas.CategoricalDtype.CategoricalDtype Python示例

示例#1

0

显示文件

def test_data_catalog_queries(catalog):
    result = catalog.trade_ticks().dtypes.to_dict()
    expected = {
        "aggressor_side":
        CategoricalDtype(categories=["UNKNOWN"], ordered=False),
        "instrument_id":
        CategoricalDtype(
            categories=[
                "Basketball,,29628709,20191221-001000,ODDS,MATCH_ODDS,1.166564490,237491,.BETFAIR",
                "Basketball,,29628709,20191221-001000,ODDS,MATCH_ODDS,1.166564490,60424,.BETFAIR",
            ],
            ordered=False,
        ),
        "match_id":
        dtype("O"),
        "price":
        dtype("float64"),
        "size":
        dtype("float64"),
        "ts_event_ns":
        dtype("int64"),
        "ts_recv_ns":
        dtype("int64"),
        "type":
        CategoricalDtype(categories=["TradeTick"], ordered=False),
    }
    assert result == expected

示例#2

0

显示文件

文件： test_astype.py 项目： ukarroum/pandas

    def test_astype_categorical_to_categorical(
        self, name, dtype_ordered, series_ordered
    ):
        # GH#10696, GH#18593
        s_data = list("abcaacbab")
        s_dtype = CategoricalDtype(list("bac"), ordered=series_ordered)
        s = Series(s_data, dtype=s_dtype, name=name)

        # unspecified categories
        dtype = CategoricalDtype(ordered=dtype_ordered)
        result = s.astype(dtype)
        exp_dtype = CategoricalDtype(s_dtype.categories, dtype_ordered)
        expected = Series(s_data, name=name, dtype=exp_dtype)
        tm.assert_series_equal(result, expected)

        # different categories
        dtype = CategoricalDtype(list("adc"), dtype_ordered)
        result = s.astype(dtype)
        expected = Series(s_data, name=name, dtype=dtype)
        tm.assert_series_equal(result, expected)

        if dtype_ordered is False:
            # not specifying ordered, so only test once
            expected = s
            result = s.astype("category")
            tm.assert_series_equal(result, expected)

示例#3

0

显示文件

def create_schema(phases, sources):
    """Build the column-name -> dtype mapping for the paired dataframe.

    Every per-entity field is emitted twice, once prefixed with ``left_`` and
    once with ``right_``. The unprefixed ``judgement``, ``source``, ``phase``,
    ``features`` and ``schema`` entries are added afterwards.

    :param phases: categories used for the ``phase`` column
    :param sources: categories used for the ``source`` column
    :return: dict mapping column name to dtype
    """
    # Fields that follow "name" and "schema", all stored as strings.
    string_fields = (
        "collection_id",
        "id",
        "country",
        "address",
        "registrationNumber",
        "alias",
        "status",
        "classification",
        "gender",
        "firstName",
        "lastName",
        "birthPlace",
        "birthDate",
        "idNumber",
        "motherName",
        "nationality",
    )
    per_entity = {
        "name": StringDtype(),
        "schema": CategoricalDtype(settings.SCHEMAS),
    }
    per_entity.update((field, StringDtype()) for field in string_fields)

    meta = {}
    for side in ("left", "right"):
        for field, field_dtype in per_entity.items():
            meta[f"{side}_{field}"] = field_dtype

    meta.update(
        judgement=bool,
        source=CategoricalDtype(sources),
        phase=CategoricalDtype(phases),
        features=object,
        schema=StringDtype(),
    )
    return meta

示例#4

0

显示文件

文件： test_astype.py 项目： AlexeyDzyubaP/LinearReg

    def test_astype_category(self, name, dtype_ordered, index_ordered):
        # GH#18630
        index = CategoricalIndex(
            list("aabbca"), categories=list("cab"), ordered=index_ordered
        )
        if name:
            index = index.rename(name)

        # standard categories
        dtype = CategoricalDtype(ordered=dtype_ordered)
        result = index.astype(dtype)
        expected = CategoricalIndex(
            index.tolist(),
            name=name,
            categories=index.categories,
            ordered=dtype_ordered,
        )
        tm.assert_index_equal(result, expected)

        # non-standard categories
        dtype = CategoricalDtype(index.unique().tolist()[:-1], dtype_ordered)
        result = index.astype(dtype)
        expected = CategoricalIndex(index.tolist(), name=name, dtype=dtype)
        tm.assert_index_equal(result, expected)

        if dtype_ordered is False:
            # dtype='category' can't specify ordered, so only test once
            result = index.astype("category")
            expected = index
            tm.assert_index_equal(result, expected)

示例#5

0

显示文件

文件： test_astype.py 项目： ukarroum/pandas

    def test_astype_from_categorical_with_keywords(self):
        # with keywords
        lst = ["a", "b", "c", "a"]
        ser = Series(lst)
        exp = Series(Categorical(lst, ordered=True))
        res = ser.astype(CategoricalDtype(None, ordered=True))
        tm.assert_series_equal(res, exp)

        exp = Series(Categorical(lst, categories=list("abcdef"), ordered=True))
        res = ser.astype(CategoricalDtype(list("abcdef"), ordered=True))
        tm.assert_series_equal(res, exp)

示例#6

0

显示文件

文件： FeatureEncoding.py 项目： JamesMcGuigan/kaggle-house-prices

    def X_feature_onehot(self, dataframe: DataFrame) -> DataFrame:
        """One-hot encode the fields listed in ``self.params['X_feature_onehot']``.

        Field names are grouped by basename (the name with a trailing number
        and optional ``st``/``nd``/``rd`` suffix removed). All fields of a
        group share one categorical dtype built from the values found in
        ``self.data['combined']``, and their dummy columns are combined into
        a single set of ``<basename>_<value>`` columns joined onto
        ``dataframe``.

        Side effects: the listed columns of ``dataframe`` are converted to
        categorical dtype in place, and the encoded field names are appended
        to ``self.params['X_feature_exclude']``.

        :param dataframe: frame containing the fields to encode
        :return: the frame with the one-hot columns joined on
        """
        # fieldgroups[basename] = [ fieldname ]
        # noinspection PyArgumentList
        fieldgroups = groupby(
            # Raw string: '\d' in a plain literal is an invalid escape sequence.
            curry(re.sub)(r'\d+(st|nd|rd)?$')(''),  # basename
            self.params['X_feature_onehot']  # fieldnames
        )
        encodings = {}
        for basename, fieldnames in fieldgroups.items():
            # NOTE: in theory, unique_values should be hardcoded based on data_description.txt
            #       for Kaggle, we can cheat and just take unique_values from self.data['combined']
            # BUGFIX: running to_X() separately on test/train/validate datasets results in column name mismatches
            unique_values = np.unique(
                self.data['combined'][fieldnames].dropna().values)
            category_dtype = CategoricalDtype(categories=unique_values)

            for fieldname in fieldnames:
                dataframe[fieldname] = dataframe[fieldname].astype(
                    category_dtype)
                onehot = pd.get_dummies(dataframe[fieldname],
                                        prefix=basename,
                                        prefix_sep='_')
                if basename not in encodings:
                    encodings[basename] = onehot
                else:
                    # NOTE(review): the original comment called this "bitwise
                    # addition", but `&` is a bitwise AND (a flag survives only
                    # if set in every field of the group). If a union was
                    # intended this should be `|` - confirm before changing.
                    encodings[basename] = onehot & encodings[basename]

        # Add additional onehot columns to dataframe
        for onehot in encodings.values():
            dataframe = dataframe.join(onehot)

        # Mark original categorical columns for exclusion
        self.params['X_feature_exclude'] += self.params['X_feature_onehot']
        return dataframe

示例#7

0

显示文件

    def test_construction_with_categorical_dtype(self):
        """Build indexes from a CategoricalDtype and reject conflicting kwargs."""
        data = "a a b b".split()
        cats = "c b a".split()
        ordered = True
        dtype = CategoricalDtype(categories=cats, ordered=ordered)
        expected = CategoricalIndex(data, categories=cats, ordered=ordered)

        # GH#18109: CategoricalIndex accepts the dtype directly.
        tm.assert_index_equal(
            CategoricalIndex(data, dtype=dtype), expected, exact=True
        )

        # GH#19032: so does the generic Index constructor.
        tm.assert_index_equal(Index(data, dtype=dtype), expected, exact=True)

        # error when combining categories/ordered and dtype kwargs
        msg = "Cannot specify `categories` or `ordered` together with `dtype`."
        for extra in ({"categories": cats}, {"ordered": ordered}):
            for klass in (CategoricalIndex, Index):
                with pytest.raises(ValueError, match=msg):
                    klass(data, dtype=dtype, **extra)

示例#8

0

显示文件

文件： bms_table.py 项目： naktazdim/lr2irscraper

    def to_dataframe(self) -> pd.DataFrame:
        """Return the difficulty-table data as a DataFrame.

        The "level" column is an ordered Categorical; the frame is otherwise
        built with object dtype (strings). Each level label is prefixed with
        the header's tag, or its symbol when no tag is set (for example "▼0").
        Missing values become empty strings.

        :return: DataFrame
        """
        assert self.data is not None

        if len(self.data) > 0:
            table = pd.DataFrame.from_dict(self.data, dtype=object).fillna("")
        else:
            # Even when there is no data, create the columns the spec requires;
            # the processing below needs the "level" column to exist.
            table = pd.DataFrame(columns=["md5", "level"], dtype=object)

        prefix = self.header.get("tag") or self.header["symbol"]
        raw_order = (self.header.get("level_order")
                     or table["level"].drop_duplicates().values)
        # The spec allows Array(String | Integer); normalise everything to str.
        level_order = [str(level) for level in raw_order]

        def _prefixed_levels(df):
            return df["level"].cat.rename_categories(
                [prefix + level for level in level_order])

        # The spec says "level" is a str, but ints do occur in practice
        # (e.g. 新 Overjoy), so cast to str before making it categorical.
        as_text = table.astype({"level": str})
        as_category = as_text.astype({
            "level": CategoricalDtype(categories=level_order, ordered=True)
        })
        return as_category.assign(level=_prefixed_levels)

示例#9

0

显示文件

文件： test_at.py 项目： 0benson0/stock-knowledge-graph

    def test_at_setitem_categorical_missing(self):
        df = DataFrame(index=range(3),
                       columns=range(3),
                       dtype=CategoricalDtype(["foo", "bar"]))
        df.at[1, 1] = "foo"

        expected = DataFrame(
            [
                [np.nan, np.nan, np.nan],
                [np.nan, "foo", np.nan],
                [np.nan, np.nan, np.nan],
            ],
            dtype=CategoricalDtype(["foo", "bar"]),
        )

        tm.assert_frame_equal(df, expected)

示例#10

0

显示文件

文件： test_astype.py 项目： ukarroum/pandas

    def test_astype_categoricaldtype(self):
        s = Series(["a", "b", "a"])
        result = s.astype(CategoricalDtype(["a", "b"], ordered=True))
        expected = Series(Categorical(["a", "b", "a"], ordered=True))
        tm.assert_series_equal(result, expected)

        result = s.astype(CategoricalDtype(["a", "b"], ordered=False))
        expected = Series(Categorical(["a", "b", "a"], ordered=False))
        tm.assert_series_equal(result, expected)

        result = s.astype(CategoricalDtype(["a", "b", "c"], ordered=False))
        expected = Series(
            Categorical(["a", "b", "a"], categories=["a", "b", "c"], ordered=False)
        )
        tm.assert_series_equal(result, expected)
        tm.assert_index_equal(result.cat.categories, Index(["a", "b", "c"]))

示例#11

0

显示文件

文件： tasks.py 项目： codeaudit/humannotator

 def __init__(self, *args, categories=None, **kwargs):
     """Set up the categories, their pandas dtype and the rendered option list.

     :param categories: either a mapping of key -> category, or an iterable
         of categories, which is keyed "1", "2", ... in iteration order.
         Required despite the ``None`` default.
     :raises TypeError: if ``categories`` is not supplied.

     Remaining positional and keyword arguments are forwarded to the parent
     initializer.
     """
     if categories is None:
         # Previously this fell through to enumerate(None) and failed with an
         # opaque "'NoneType' object is not iterable".
         raise TypeError(
             "categories is required: pass a mapping or an iterable of categories")
     if not isinstance(categories, Mapping):
         categories = {str(i): c for i, c in enumerate(categories, start=1)}
     self.categories = categories
     self.dtype = CategoricalDtype(self.categories.values(), ordered=None)
     self.items = ''.join(option(i, c)
                          for i, c in categories.items()).strip('\n')
     super().__init__(*args, **kwargs)

示例#12

0

显示文件

文件： test_astype.py 项目： ukarroum/pandas

    def test_astype_bool_missing_to_categorical(self):
        # GH-19182
        s = Series([True, False, np.nan])
        assert s.dtypes == np.object_

        result = s.astype(CategoricalDtype(categories=[True, False]))
        expected = Series(Categorical([True, False, np.nan], categories=[True, False]))
        tm.assert_series_equal(result, expected)

示例#13

0

显示文件

文件： test_london_scraper_output.py 项目： michaelwalshe/scrape_london_marathon

def scraper_output():
    """Load the latest scraper output CSV for the data tests.

    The scraper itself is not run for these tests; instead the most recent
    output file under ``params.ROOT`` is read with explicit column dtypes and
    the "Finish" column is converted to timedeltas.
    """
    age_categories = [
        "18-39",
        "40-44",
        "45-49",
        "50-54",
        "55-59",
        "60-64",
        "65-69",
        "70+",
        "70-74",
        "75-79",
        "80-84",
        "85+",
        "80+",
        "Unknown",
    ]
    column_dtypes = {
        "Place (Overall)": "Int64",
        "Place (Gender)": "Int64",
        "Name": str,
        "Sex": str,
        "Club": str,
        "Running Number": object,
        "Category": CategoricalDtype(categories=age_categories, ordered=False),
        "Year": "Int64",
        "Country": str,
        "FirstName": str,
        "LastName": str,
        "DSQ": bool,
        "Finish (Total Seconds)": "float64",
    }

    results = pd.read_csv(
        f"{params.ROOT}/../data/london_marathon_latest.csv",
        dtype=column_dtypes,
        parse_dates=["Finish"],
    )
    results["Finish"] = pd.to_timedelta(results["Finish"])
    return results

示例#14

0

显示文件

    def test_astype_str_int_categories_to_nullable_int(self):
        # GH#39616
        dtype = CategoricalDtype([str(i) for i in range(5)])
        codes = np.random.randint(5, size=20)
        arr = Categorical.from_codes(codes, dtype=dtype)

        res = arr.astype("Int64")
        expected = array(codes, dtype="Int64")
        tm.assert_extension_array_equal(res, expected)

示例#15

0

显示文件

文件： test_iloc.py 项目： lordgrenville/pandas

 def test_setitem_mix_of_nan_and_interval(self, not_na, nulls_fixture):
     # GH#27937
     dtype = CategoricalDtype(categories=[not_na])
     ser = Series(
         [nulls_fixture, nulls_fixture, nulls_fixture, nulls_fixture], dtype=dtype
     )
     ser.iloc[:3] = [nulls_fixture, not_na, nulls_fixture]
     exp = Series([nulls_fixture, not_na, nulls_fixture, nulls_fixture], dtype=dtype)
     tm.assert_series_equal(ser, exp)

示例#16

0

显示文件

文件： test_update.py 项目： zoehuang7/pandas

 def test_update_with_categorical_type(self):
     # GH 25744
     dtype = CategoricalDtype(["a", "b", "c", "d"])
     s1 = Series(["a", "b", "c"], index=[1, 2, 3], dtype=dtype)
     s2 = Series(["b", "a"], index=[1, 2], dtype=dtype)
     s1.update(s2)
     result = s1
     expected = Series(["b", "a", "c"], index=[1, 2, 3], dtype=dtype)
     tm.assert_series_equal(result, expected)

示例#17

0

显示文件

文件： test_io.py 项目： vishalbelsare/hpat

 def test_impl():
     # Read "csv_data_cat1.csv" as three named columns: C1 with the
     # enclosing scope's ``int_type``, C2 as a categorical over 'A'..'D',
     # and C3 as str. Returns the resulting DataFrame.
     # NOTE(review): presumably this function is handed to a compiler by the
     # surrounding test, in which case the literal form of the read_csv call
     # may matter - confirm before restructuring it.
     ct_dtype = CategoricalDtype(['A', 'B', 'C', 'D'])
     df = pd.read_csv("csv_data_cat1.csv",
                      names=['C1', 'C2', 'C3'],
                      dtype={
                          'C1': int_type,
                          'C2': ct_dtype,
                          'C3': str
                      })
     return df

示例#18

0

显示文件

文件： replay_analysis.py 项目： samklr/amazon-redshift-utils

def read_data(table_name, df, report_columns, report):
    """Map raw data file to formatted table and upload it to S3 as CSV.

    The table's type is looked up in ``report.tables``: 'breakdown' selects
    the report columns, 'metric' additionally sorts rows by an ordered
    Measure category, and 'measure' keeps only the rows whose Measure equals
    ``table_name``. Values are rounded to two decimal places.

    The process exits with status -1 if ``df`` is empty or the upload fails.
    For 'metric' tables the Measure column of ``df`` is converted to an
    ordered categorical in place.

    @param table_name: name of table
    @param df: DataFrame of raw data
    @param report_columns: List of column names for report table
    @param report: Report object
    @return: DataFrame of formatted data
    """
    # Local import: sys.exit is always available, whereas the bare ``exit``
    # builtin is injected by the ``site`` module and may be missing.
    import sys

    logger = logging.getLogger("SimpleReplayLogger")

    if df.empty:
        logger.error("Data is empty. Failed to generate report.")
        sys.exit(-1)
    cols = [g_columns[x] for x in report_columns]
    table_type = report.tables.get(table_name).get('type')

    report_table = None
    if table_type == 'breakdown':
        report_table = df[cols]
    elif table_type == 'metric':
        # Ordered categorical so sort_values follows this fixed measure order.
        order = CategoricalDtype([
            'Query Latency', 'Compile Time', 'Queue Time', 'Execution Time',
            'Commit Queue Time', 'Commit Time'
        ],
                                 ordered=True)
        df[g_columns.get('Measure')] = df[g_columns.get('Measure')].astype(
            order)
        frame = df.sort_values(g_columns.get('Measure'))
        report_table = frame[cols]
    elif table_type == 'measure':  # filter for specific measure type
        report_table = df[cols][df[g_columns.get("Measure")] == table_name]

    # Round values in the dataframe to two decimal places.
    report_table = pd.DataFrame(report_table).round(2)
    # NOTE(review): reindex() returns a new frame and the result is discarded,
    # so this call currently has no effect on report_table. Assigning it would
    # replace the columns with ``report_columns`` labels (NaN where they do
    # not match) - confirm the intent before changing.
    report_table.reindex(columns=report_columns)

    # upload formatted dataframe to S3 as csv
    try:
        s3_resource = boto3.resource('s3')
        file = f"{table_name.replace(' ', '')}.csv"  # set filename for saving
        csv_buffer = StringIO()
        report_table.to_csv(csv_buffer)
        logger.debug(report.bucket)
        s3_resource.Object(report.bucket.get("bucket_name"),
                           f'{report.path}/aggregated_data/{file}').put(
                               Body=csv_buffer.getvalue())
    except Exception as e:
        logger.error(
            f"Could not upload aggregated data. Please confirm bucket. Error occurred while processing "
            f"data. {e}")
        sys.exit(-1)
    return report_table

示例#19

0

显示文件

    def test_astype_categorical_to_other(self):
        """Cast categorical Series to category, float, str, int and object."""
        labels = Categorical([f"{i} - {i + 499}" for i in range(0, 10000, 500)])
        raw = Series(np.random.RandomState(0).randint(0, 10000,
                                                      100)).sort_values()
        ser = cut(raw, range(0, 10500, 500), right=False, labels=labels)

        # Casting to category (by name or by dtype) leaves the Series as is.
        for target in ("category", CategoricalDtype()):
            tm.assert_series_equal(ser.astype(target), ser)

        msg = r"Cannot cast object dtype to float64"
        with pytest.raises(ValueError, match=msg):
            ser.astype("float64")

        letters = ["a", "b", "b", "a", "a", "c", "c", "c"]
        tm.assert_series_equal(
            Series(Categorical(letters)).astype("str"), Series(letters)
        )
        digit_cat = Series(Categorical(["1", "2", "3", "4"]))
        digit_exp = Series([1, 2, 3, 4]).astype("int")
        tm.assert_series_equal(digit_cat.astype("int"), digit_exp)

        # object don't sort correctly, so just compare that we have the same
        # values
        def assert_same_unique(left, right):
            tm.assert_almost_equal(np.sort(np.unique(left)),
                                   np.sort(np.unique(right)))

        as_objects = Series(np.array(ser.values), name="value_group")
        for target in ("object", np.object_):
            assert_same_unique(ser.astype(target), as_objects)

        # array conversion
        tm.assert_almost_equal(np.array(ser), np.array(ser.values))

        for target in ("category", CategoricalDtype()):
            tm.assert_series_equal(ser.astype(target), ser)

        # Going through object and back is compared against the Series with
        # its categories sorted and unused categories removed.
        roundtrip_expected = ser.cat.set_categories(
            ser.cat.categories.sort_values()).cat.remove_unused_categories()
        for target in ("category", CategoricalDtype()):
            tm.assert_series_equal(
                ser.astype("object").astype(target), roundtrip_expected
            )

示例#20

0

显示文件

    def test_iloc_getitem_categorical_values(self):
        # GH#14580
        # test iloc() on Series with Categorical data

        ser = Series([1, 2, 3]).astype("category")

        # get slice
        result = ser.iloc[0:2]
        expected = Series([1, 2]).astype(CategoricalDtype([1, 2, 3]))
        tm.assert_series_equal(result, expected)

        # get list of indexes
        result = ser.iloc[[0, 1]]
        expected = Series([1, 2]).astype(CategoricalDtype([1, 2, 3]))
        tm.assert_series_equal(result, expected)

        # get boolean array
        result = ser.iloc[[True, False, False]]
        expected = Series([1]).astype(CategoricalDtype([1, 2, 3]))
        tm.assert_series_equal(result, expected)

示例#21

0

显示文件

    def test_ordinal_encode_category(self):
        """Ordinal-encode two ordered categorical columns and check the codes."""
        frame = pd.DataFrame(
            [['C', '3'], ['D', '4'], ['D', '4']],
            columns=['alpha', 'digits'],
        )
        # Give both columns an explicit category order.
        for column, categories in (("digits", ["4", "3"]),
                                   ("alpha", ["D", "C"])):
            frame[column] = frame[column].astype(
                CategoricalDtype(categories=categories, ordered=True))

        # Stack rows 1.. on top of rows ..1, then renumber the four rows.
        stacked = pd.concat([frame.loc[1:, :], frame.loc[:1, :]])
        stacked.index = range(4)

        encoder = OrdinalEncoder()
        encoder.in_feature_groups = "cat"
        encoder.out_feature_groups = "ordinal"

        container = DataFrameContainer(dataset_instance=stacked)
        container.set_feature_groups(["cat"] * 2)
        encoder.fit(X_train=container)
        result = encoder.transform(X_train=container)["X_train"]
        print(result)

        codes = {0: 0, 1: 0, 2: 1, 3: 0}
        should_be = pd.DataFrame({'alpha': dict(codes), 'digits': dict(codes)})
        assert np.all(result.data == should_be)

示例#22

0

显示文件

文件： ArffManager.py 项目： itayh1/ExploreKitPy

 def _get_dataframe_by_attrs(data, attributes):
     df = pd.DataFrame(data,
                       columns=[attr_name for attr_name, _ in attributes])
     for attr_name, attr_type in attributes:
         if type(attr_type) == str:
             if attr_type.upper() in ['NUMERIC', 'REAL']:
                 df[attr_name] = df[attr_name].astype(float)
         elif type(attr_type) == list:
             df[attr_name] = df[attr_name].astype(
                 CategoricalDtype(attr_type))
         else:
             raise Exception(
                 f'Unknown attribute type while loading arff: "{attr_type}"'
             )
     return df

示例#23

0

显示文件

    def test_astype_category(self, dtype_ordered, cat_ordered):
        # GH#10696/GH#18593
        data = list("abcaacbab")
        cat = Categorical(data, categories=list("bac"), ordered=cat_ordered)

        # standard categories
        dtype = CategoricalDtype(ordered=dtype_ordered)
        result = cat.astype(dtype)
        expected = Categorical(data,
                               categories=cat.categories,
                               ordered=dtype_ordered)
        tm.assert_categorical_equal(result, expected)

        # non-standard categories
        dtype = CategoricalDtype(list("adc"), dtype_ordered)
        result = cat.astype(dtype)
        expected = Categorical(data, dtype=dtype)
        tm.assert_categorical_equal(result, expected)

        if dtype_ordered is False:
            # dtype='category' can't specify ordered, so only test once
            result = cat.astype("category")
            expected = cat
            tm.assert_categorical_equal(result, expected)

示例#24

0

显示文件

    def test_astype_categorical_retains_ordered(self, ordered):
        index = IntervalIndex.from_breaks(range(5))
        arr = index._data

        dtype = CategoricalDtype(None, ordered=ordered)

        expected = Categorical(list(arr), ordered=ordered)
        result = arr.astype(dtype)
        assert result.ordered is ordered
        tm.assert_categorical_equal(result, expected)

        # test IntervalIndex.astype while we're at it.
        result = index.astype(dtype)
        expected = Index(expected)
        tm.assert_index_equal(result, expected)

示例#25

0

显示文件

文件： test_sort_index.py 项目： GabrielUlisses/pandas

    def test_sort_index_categorical_index(self):

        df = DataFrame({
            "A":
            np.arange(6, dtype="int64"),
            "B":
            Series(list("aabbca")).astype(CategoricalDtype(list("cab"))),
        }).set_index("B")

        result = df.sort_index()
        expected = df.iloc[[4, 0, 1, 5, 2, 3]]
        tm.assert_frame_equal(result, expected)

        result = df.sort_index(ascending=False)
        expected = df.iloc[[2, 3, 0, 1, 5, 4]]
        tm.assert_frame_equal(result, expected)

示例#26

0

显示文件

文件： test_analytics.py 项目： tnir/pandas

    def test_unique_index_series(self, ordered):
        # GH38140
        dtype = CategoricalDtype([3, 2, 1], ordered=ordered)

        c = Categorical([3, 1, 2, 2, 1], dtype=dtype)
        # Categorical.unique sorts categories by appearance order
        # if ordered=False
        exp = Categorical([3, 1, 2], dtype=dtype)
        tm.assert_categorical_equal(c.unique(), exp)

        tm.assert_index_equal(Index(c).unique(), Index(exp))
        tm.assert_categorical_equal(Series(c).unique(), exp)

        c = Categorical([1, 1, 2, 2], dtype=dtype)
        exp = Categorical([1, 2], dtype=dtype)
        tm.assert_categorical_equal(c.unique(), exp)
        tm.assert_index_equal(Index(c).unique(), Index(exp))
        tm.assert_categorical_equal(Series(c).unique(), exp)

示例#27

0

显示文件

文件： FeatureEncoding.py 项目： JamesMcGuigan/kaggle-house-prices

    def X_feature_label_encode(self, dataframe: DataFrame) -> DataFrame:
        """Label-encode the fields configured in ``self.params['X_feature_label_encode']``.

        The config maps a comma-separated label string to the list of fields
        that use those labels. Each field is converted to an ordered
        categorical over the labels, missing values are replaced with the
        first label, and a ``<fieldname>_Numeric`` column holding the
        ``LabelEncoder`` output is added.

        Side effects: ``dataframe`` is modified in place and the encoded field
        names are appended to ``self.params['X_feature_exclude']``.

        :param dataframe: frame containing the fields to encode
        :return: the same frame with the additional ``*_Numeric`` columns
        """
        for label_string, fieldnames in self.params[
                'X_feature_label_encode'].items():
            labels = label_string.split(',')
            category_dtype = CategoricalDtype(categories=labels, ordered=True)

            encoder = LabelEncoder()
            encoder.fit(labels)
            for fieldname in fieldnames:
                # Replace NaN with first label 'NA', encoder.transform() will throw exception on unseen values
                column = dataframe[fieldname].astype(category_dtype)
                # Assign the filled column back instead of calling
                # fillna(inplace=True) on dataframe[fieldname]: that acts on a
                # temporary selection and does not update the frame under
                # pandas Copy-on-Write.
                dataframe[fieldname] = column.fillna(labels[0])
                dataframe[f"{fieldname}_Numeric"] = encoder.transform(
                    dataframe[fieldname])

        self.params['X_feature_exclude'] += list(
            flatten(self.params['X_feature_label_encode'].values()))
        return dataframe

示例#28

0

显示文件

文件： random_t.py 项目： SHUYUAN66/COGS-118A-Final

def clean_nsr(df):
    """Rename, clean and order the categorical columns of the frame.

    The columns of ``df`` are renamed in place to the fixed list below.
    Underscores are stripped from all values, and every column except
    'finance' is converted to an ordered categorical whose category order is
    taken from the module-level ``developer`` mapping. The 'finance' column
    is restored from the untouched copy, so it keeps its original values.

    :param df: frame with nine columns, in the order of the names below
    :return: a new frame with ordered categorical columns plus 'finance'
    """
    od = developer
    nsr_var = [
        'parents', 'has_nurs', 'form', 'children', 'housing', 'finance',
        'social', 'health', 'target'
    ]
    df.columns = nsr_var
    raw = df.copy()
    df = df.replace('_', '', regex=True)
    df = df.drop(columns=['finance'])
    for i in df.columns:
        df[i] = df[i].astype('category')
        # give the order
        df[i] = df[i].cat.reorder_categories(od[i], ordered=True)

    df['finance'] = raw['finance']
    return df

示例#29

0

显示文件

文件： test_analytics.py 项目： tnir/pandas

    def test_unique(self, ordered):
        # GH38140
        dtype = CategoricalDtype(["a", "b", "c"], ordered=ordered)

        # categories are reordered based on value when ordered=False
        cat = Categorical(["a", "b", "c"], dtype=dtype)
        res = cat.unique()
        tm.assert_categorical_equal(res, cat)

        cat = Categorical(["a", "b", "a", "a"], dtype=dtype)
        res = cat.unique()
        tm.assert_categorical_equal(res, Categorical(["a", "b"], dtype=dtype))

        cat = Categorical(["c", "a", "b", "a", "a"], dtype=dtype)
        res = cat.unique()
        exp_cat = Categorical(["c", "a", "b"], dtype=dtype)
        tm.assert_categorical_equal(res, exp_cat)

        # nan must be removed
        cat = Categorical(["b", np.nan, "b", np.nan, "a"], dtype=dtype)
        res = cat.unique()
        exp_cat = Categorical(["b", np.nan, "a"], dtype=dtype)
        tm.assert_categorical_equal(res, exp_cat)

示例#30

0

显示文件

class TestAstype:
    def test_astype_float(self, float_frame):
        """Compare frame-level integer casts with casting the raw values."""

        def assert_matches_values_cast(target):
            # Cast through the frame first, then rebuild the expectation from
            # the ndarray cast while keeping the frame's labels.
            via_frame = float_frame.astype(target)
            via_values = DataFrame(
                float_frame.values.astype(target),
                index=float_frame.index,
                columns=float_frame.columns,
            )
            tm.assert_frame_equal(via_frame, via_values)

        for target in (int, np.int32):
            assert_matches_values_cast(target)

        # add a column holding the string "5" and repeat the int cast
        float_frame["foo"] = "5"
        assert_matches_values_cast(int)

    def test_astype_mixed_float(self, mixed_float_frame):
        """Cast columns A and B to float32 and float16 and run ``_check_cast``."""
        # mixed casting
        for target in ("float32", "float16"):
            subset = mixed_float_frame.reindex(columns=["A", "B"])
            _check_cast(subset.astype(target), target)

    def test_astype_mixed_type(self, mixed_type_frame):
        """Cast the numeric part of a mixed frame to several dtypes."""
        # mixed casting
        numeric = mixed_type_frame._get_numeric_data().copy()
        numeric["little_float"] = np.array(12345.0, dtype="float16")
        numeric["big_float"] = np.array(123456789101112.0, dtype="float64")

        for target in ("float64", "int64"):
            _check_cast(numeric.astype(target), target)

        # float16 is only applied to the single float16-sized column
        small_only = numeric.reindex(columns=["little_float"])
        _check_cast(small_only.astype("float16"), "float16")

        for target in ("float32", "int32"):
            _check_cast(numeric.astype(target), target)

        # to object
        _check_cast(numeric.astype("O"), "object")

    @td.skip_array_manager_not_yet_implemented
    def test_astype_with_exclude_string(self, float_frame):
        """With ``errors="ignore"`` an added string column is expected unchanged."""
        for target in (int, np.int32):
            with_text = float_frame.copy()
            expected = float_frame.astype(target)
            with_text["string"] = "foo"
            casted = with_text.astype(target, errors="ignore")

            # the expectation carries the same, uncast, string column
            expected["string"] = "foo"
            tm.assert_frame_equal(casted, expected)

    def test_astype_with_view_float(self, float_frame):
        """Smoke test: ``astype(..., copy=False)`` runs; results are not checked."""
        rounded_ints = np.round(float_frame).astype(np.int32)
        rounded_ints.astype(np.float32, copy=False)

        # TODO(wesm): verification?
        as_float64 = float_frame.astype(np.float64)
        as_float64.astype(np.int64, copy=False)

    def test_astype_with_view_mixed_float(self, mixed_float_frame):
        """Smoke test: columns A, B, C cast to int64 and float32 without checks."""
        subset = mixed_float_frame.reindex(columns=["A", "B", "C"])

        for target in (np.int64, np.float32):
            subset.astype(target)

    @td.skip_array_manager_not_yet_implemented
    @pytest.mark.parametrize("dtype", [np.int32, np.int64])
    @pytest.mark.parametrize("val", [np.nan, np.inf])
    def test_astype_cast_nan_inf_int(self, val, dtype):
        """A frame holding NaN or inf must raise when cast to an integer dtype."""
        # see GH#14265
        non_finite = DataFrame([val])
        pattern = "Cannot convert non-finite values \\(NA or inf\\) to integer"

        with pytest.raises(ValueError, match=pattern):
            non_finite.astype(dtype)

    def test_astype_str(self):
        """Cast a frame of datetimes, timedeltas, ints and floats to ``str``."""
        # see GH#9757
        naive = Series(date_range("2010-01-04", periods=5))
        aware = Series(date_range("3/6/2012 00:00", periods=5, tz="US/Eastern"))
        deltas = Series([Timedelta(x, unit="d") for x in range(5)])
        ints = Series(range(5))
        floats = Series([0.0, 0.2, 0.4, 0.6, 0.8])

        frame = DataFrame(
            {"a": naive, "b": aware, "c": deltas, "d": ints, "e": floats}
        )

        # Datetime-like
        result = frame.astype(str)

        # Expected strings are built element by element from each column.
        expected = DataFrame(
            {
                "a": [str(Timestamp(x)._date_repr) for x in naive._values],
                "b": [str(Timestamp(x)) for x in aware._values],
                "c": [Timedelta(x)._repr_base() for x in deltas._values],
                "d": [str(x) for x in ints._values],
                "e": [str(x) for x in floats._values],
            }
        )

        tm.assert_frame_equal(result, expected)

    def test_astype_str_float(self):
        """Check the strings produced when float frames are cast to ``str``.

        NaN is expected to become ``"nan"`` and a float literal with more
        digits than a float64 holds is expected as ``"1.1234567890123457"``.
        """
        # see GH#11302
        # Use ``np.nan``: the ``np.NaN`` alias was removed in NumPy 2.0.
        result = DataFrame([np.nan]).astype(str)
        expected = DataFrame(["nan"])

        tm.assert_frame_equal(result, expected)
        result = DataFrame([1.12345678901234567890]).astype(str)

        val = "1.1234567890123457"
        expected = DataFrame([val])
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("dtype_class", [dict, Series])
    def test_astype_dict_like(self, dtype_class):
        """Exercise ``astype`` with a column -> dtype mapping.

        ``dtype_class`` builds the mapping (``dict`` or ``Series``). After
        every cast the source frame is compared with a deep copy taken up
        front, so any in-place modification fails the test.
        """
        # GH7271 & GH16717
        dates = Series(date_range("2010-01-04", periods=5))
        ints = Series(range(5))
        floats = Series([0.0, 0.2, 0.4, 0.6, 0.8])
        num_strs = Series(["1.0", "2", "3.14", "4", "5.4"])
        df = DataFrame({"a": dates, "b": ints, "c": floats, "d": num_strs})
        original = df.copy(deep=True)

        # change type of a subset of columns
        subset_map = dtype_class({"b": "str", "d": "float32"})
        result = df.astype(subset_map)
        expected = DataFrame(
            {
                "a": dates,
                "b": Series(["0", "1", "2", "3", "4"]),
                "c": floats,
                "d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float32"),
            }
        )
        tm.assert_frame_equal(result, expected)
        tm.assert_frame_equal(df, original)

        float_map = dtype_class(
            {"b": np.float32, "c": "float32", "d": np.float64}
        )
        result = df.astype(float_map)
        expected = DataFrame(
            {
                "a": dates,
                "b": Series([0.0, 1.0, 2.0, 3.0, 4.0], dtype="float32"),
                "c": Series([0.0, 0.2, 0.4, 0.6, 0.8], dtype="float32"),
                "d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float64"),
            }
        )
        tm.assert_frame_equal(result, expected)
        tm.assert_frame_equal(df, original)

        # change all columns
        all_str_map = dtype_class({"a": str, "b": str, "c": str, "d": str})
        tm.assert_frame_equal(df.astype(all_str_map), df.astype(str))
        tm.assert_frame_equal(df, original)

        # keys that are not column labels of the frame must raise KeyError
        bad_maps = [dtype_class({"b": str, 2: str}), dtype_class({"e": str})]
        msg = "Only a column name can be used for the key in a dtype mappings argument"
        for bad_map in bad_maps:
            with pytest.raises(KeyError, match=msg):
                df.astype(bad_map)
        tm.assert_frame_equal(df, original)

        # mapping every column to its current dtype must give an equal frame
        same_map = dtype_class({col: df[col].dtype for col in df.columns})
        equiv = df.astype(same_map)
        tm.assert_frame_equal(df, equiv)
        tm.assert_frame_equal(df, original)

        # GH#16717
        # an empty mapping must also give an equal frame; the Series variant
        # is built with an explicit object dtype
        if dtype_class is dict:
            empty_map = dtype_class({})
        else:
            empty_map = dtype_class({}, dtype=object)
        equiv = df.astype(empty_map)
        tm.assert_frame_equal(df, equiv)
        tm.assert_frame_equal(df, original)

    def test_astype_duplicate_col(self):
        """Cast a frame in which two columns share the label ``"a"``."""
        first_a = Series([1, 2, 3, 4, 5], name="a")
        only_b = Series([0.1, 0.2, 0.4, 0.6, 0.8], name="b")
        second_a = Series([0, 1, 2, 3, 4], name="a")
        df = concat([first_a, only_b, second_a], axis=1)

        first_a_str = Series(["1", "2", "3", "4", "5"], dtype="str", name="a")
        only_b_str = Series(
            ["0.1", "0.2", "0.4", "0.6", "0.8"], dtype=str, name="b"
        )
        second_a_str = Series(["0", "1", "2", "3", "4"], dtype="str", name="a")

        # whole-frame cast: every column becomes str
        whole = df.astype(str)
        expected = concat([first_a_str, only_b_str, second_a_str], axis=1)
        tm.assert_frame_equal(whole, expected)

        # mapping keyed by the duplicated label: both "a" columns are cast
        keyed = df.astype({"a": "str"})
        expected = concat([first_a_str, only_b, second_a_str], axis=1)
        tm.assert_frame_equal(keyed, expected)

    @pytest.mark.parametrize(
        "dtype",
        [
            "category",
            CategoricalDtype(),
            CategoricalDtype(ordered=True),
            CategoricalDtype(ordered=False),
            CategoricalDtype(categories=list("abcdef")),
            CategoricalDtype(categories=list("edba"), ordered=False),
            CategoricalDtype(categories=list("edcb"), ordered=True),
        ],
        ids=repr,
    )
    def test_astype_categorical(self, dtype):
        """Frame-level categorical cast equals a per-column ``Categorical``."""
        # GH#18099
        data = {"A": list("abbc"), "B": list("bccd"), "C": list("cdde")}
        result = DataFrame(data).astype(dtype)
        expected = DataFrame(
            {
                label: Categorical(values, dtype=dtype)
                for label, values in data.items()
            }
        )
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "cls", [CategoricalDtype, DatetimeTZDtype, IntervalDtype])
    def test_astype_categoricaldtype_class_raises(self, cls):
        """Passing a dtype class instead of an instance raises ``TypeError``."""
        frame = DataFrame({"A": ["a", "a", "b", "c"]})
        pattern = f"Expected an instance of {cls.__name__}"

        # frame-level cast through a column mapping
        with pytest.raises(TypeError, match=pattern):
            frame.astype({"A": cls})

        # the same cast on the single column
        with pytest.raises(TypeError, match=pattern):
            frame["A"].astype(cls)

    @pytest.mark.parametrize("dtype", ["Int64", "Int32", "Int16"])
    def test_astype_extension_dtypes(self, dtype):
        """Cast a two-column float frame to a nullable integer dtype and back."""
        # GH#22578

        def make_floats():
            # fresh float64 frame for each half of the test
            return DataFrame(
                [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], columns=["a", "b"]
            )

        both_ext = DataFrame(
            {
                "a": pd.array([1, 3, 5], dtype=dtype),
                "b": pd.array([2, 4, 6], dtype=dtype),
            }
        )

        df = make_floats()
        tm.assert_frame_equal(df.astype(dtype), both_ext)
        tm.assert_frame_equal(df.astype("int64").astype(dtype), both_ext)
        tm.assert_frame_equal(df.astype(dtype).astype("float64"), df)

        # convert only column "b" by assignment, leaving "a" as float
        df = make_floats()
        df["b"] = df["b"].astype(dtype)
        one_ext = DataFrame(
            {"a": [1.0, 3.0, 5.0], "b": pd.array([2, 4, 6], dtype=dtype)}
        )
        tm.assert_frame_equal(df, one_ext)

        tm.assert_frame_equal(df.astype(dtype), both_ext)
        tm.assert_frame_equal(df.astype("int64").astype(dtype), both_ext)

    @pytest.mark.parametrize("dtype", ["Int64", "Int32", "Int16"])
    def test_astype_extension_dtypes_1d(self, dtype):
        """Single-column variant of the nullable integer cast checks."""
        # GH#22578
        as_ext = DataFrame({"a": pd.array([1, 2, 3], dtype=dtype)})

        df = DataFrame({"a": [1.0, 2.0, 3.0]})
        tm.assert_frame_equal(df.astype(dtype), as_ext)
        tm.assert_frame_equal(df.astype("int64").astype(dtype), as_ext)

        # convert the column by assignment, then cast the converted frame
        df = DataFrame({"a": [1.0, 2.0, 3.0]})
        df["a"] = df["a"].astype(dtype)
        assigned = DataFrame({"a": pd.array([1, 2, 3], dtype=dtype)})
        tm.assert_frame_equal(df, assigned)

        tm.assert_frame_equal(df.astype(dtype), as_ext)
        tm.assert_frame_equal(df.astype("int64").astype(dtype), as_ext)

    @pytest.mark.parametrize("dtype", ["category", "Int64"])
    def test_astype_extension_dtypes_duplicate_col(self, dtype):
        """Frame cast with duplicate labels equals casting each column alone."""
        # GH#24704
        left = Series([0, np.nan, 4], name="a")
        right = Series([np.nan, 3, 5], name="a")

        result = concat([left, right], axis=1).astype(dtype)
        per_column = [left.astype(dtype), right.astype(dtype)]
        tm.assert_frame_equal(result, concat(per_column, axis=1))

    @pytest.mark.parametrize("dtype", [{
        100: "float64",
        200: "uint64"
    }, "category", "float64"])
    def test_astype_column_metadata(self, dtype):
        """The columns index must be equal before and after ``astype``."""
        # GH#19920
        columns = UInt64Index([100, 200, 300], name="foo")
        frame = DataFrame(np.arange(15).reshape(5, 3), columns=columns)
        casted = frame.astype(dtype)
        tm.assert_index_equal(casted.columns, columns)

    @pytest.mark.parametrize("dtype", ["M8", "m8"])
    @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
    def test_astype_from_datetimelike_to_object(self, dtype, unit):
        """Cast datetime64/timedelta64 data of any unit to object dtype."""
        # tests astype to object dtype
        # GH#19223 / GH#12425
        unit_dtype = f"{dtype}[{unit}]"
        frame = DataFrame(np.array([[1, 2, 3]], dtype=unit_dtype))
        result = frame.astype(object)
        assert (result.dtypes == object).all()

        # the first element is compared with the matching pandas scalar
        scalar_cls = Timestamp if unit_dtype.startswith("M8") else Timedelta
        assert result.iloc[0, 0] == scalar_cls(1, unit=unit)

    @pytest.mark.parametrize("arr_dtype", [np.int64, np.float64])
    @pytest.mark.parametrize("dtype", ["M8", "m8"])
    @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
    def test_astype_to_datetimelike_unit(self, arr_dtype, dtype, unit):
        """Numeric frame cast to M8/m8 equals the same cast on the ndarray."""
        # tests all units from numeric origination
        # GH#19223 / GH#12425
        target = f"{dtype}[{unit}]"
        values = np.array([[1, 2, 3]], dtype=arr_dtype)

        result = DataFrame(values).astype(target)
        expected = DataFrame(values.astype(target))

        tm.assert_frame_equal(result, expected)

    @td.skip_array_manager_not_yet_implemented
    @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
    def test_astype_to_datetime_unit(self, unit):
        """datetime64 frame cast to its own unit equals the ndarray cast."""
        # tests all units from datetime origination
        # GH#19223
        target = f"M8[{unit}]"
        values = np.array([[1, 2, 3]], dtype=target)

        result = DataFrame(values).astype(target)
        expected = DataFrame(values.astype(target))

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("unit", ["ns"])
    def test_astype_to_timedelta_unit_ns(self, unit):
        """timedelta64[ns] frame cast to the same dtype equals the ndarray cast."""
        # preserve the timedelta conversion
        # GH#19223
        target = f"m8[{unit}]"
        values = np.array([[1, 2, 3]], dtype=target)

        result = DataFrame(values).astype(target)
        expected = DataFrame(values.astype(target))

        tm.assert_frame_equal(result, expected)

    @td.skip_array_manager_not_yet_implemented
    @pytest.mark.parametrize("unit", ["us", "ms", "s", "h", "m", "D"])
    def test_astype_to_timedelta_unit(self, unit):
        """Non-ns timedelta64 cast is expected to produce float values."""
        # coerce to float
        # GH#19223
        target = f"m8[{unit}]"
        frame = DataFrame(np.array([[1, 2, 3]], dtype=target))

        result = frame.astype(target)
        # expectation: frame values cast to the unit, then to float
        as_float = frame.values.astype(target).astype(float)

        tm.assert_frame_equal(result, DataFrame(as_float))

    @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
    def test_astype_to_incorrect_datetimelike(self, unit):
        """Casting datetime64 <-> timedelta64 must raise ``TypeError``."""
        # trying to astype a m to a M, or vice-versa
        # GH#19224
        dt64 = f"M8[{unit}]"
        td64 = f"m8[{unit}]"

        # (source dtype, target dtype, expected error pattern)
        cases = [
            (dt64, td64,
             fr"Cannot cast DatetimeArray to dtype timedelta64\[{unit}\]"),
            (td64, dt64,
             fr"Cannot cast TimedeltaArray to dtype datetime64\[{unit}\]"),
        ]
        for source, target, msg in cases:
            frame = DataFrame(np.array([[1, 2, 3]], dtype=source))
            with pytest.raises(TypeError, match=msg):
                frame.astype(target)

    @td.skip_array_manager_not_yet_implemented
    def test_astype_arg_for_errors(self):
        """An invalid ``errors`` value raises; ``"ignore"`` is accepted."""
        # GH#14878
        frame = DataFrame([1, 2, 3])

        # the message contains brackets, so it is escaped before matching
        pattern = re.escape(
            "Expected value of kwarg 'errors' to be one of "
            "['raise', 'ignore']. Supplied value is 'True'"
        )
        with pytest.raises(ValueError, match=pattern):
            frame.astype(np.float64, errors=True)

        frame.astype(np.int8, errors="ignore")

    def test_astype_arg_for_errors_dictlist(self):
        """Per-column cast with ``errors="ignore"`` on a frame built from records.

        Column "a" is expected as floats; column "b", which holds "16.5%",
        and column "c" are expected unchanged.
        """
        # GH#25905
        raw_records = [
            {"a": "1", "b": "16.5%", "c": "test"},
            {"a": "2.2", "b": "15.3", "c": "another_test"},
        ]
        expected_records = [
            {"a": 1.0, "b": "16.5%", "c": "test"},
            {"a": 2.2, "b": "15.3", "c": "another_test"},
        ]
        frame = DataFrame(raw_records)
        expected = DataFrame(expected_records)
        type_dict = {"a": "float64", "b": "float64", "c": "object"}

        result = frame.astype(dtype=type_dict, errors="ignore")

        tm.assert_frame_equal(result, expected)

    def test_astype_dt64tz(self, timezone_frame):
        """Cast the timezone frame to object and to naive ``datetime64[ns]``."""
        # astype
        naive_col = [
            Timestamp("2013-01-01 00:00:00"),
            Timestamp("2013-01-02 00:00:00"),
            Timestamp("2013-01-03 00:00:00"),
        ]
        eastern_col = [
            Timestamp("2013-01-01 00:00:00-0500", tz="US/Eastern"),
            NaT,
            Timestamp("2013-01-03 00:00:00-0500", tz="US/Eastern"),
        ]
        cet_col = [
            Timestamp("2013-01-01 00:00:00+0100", tz="CET"),
            NaT,
            Timestamp("2013-01-03 00:00:00+0100", tz="CET"),
        ]
        # the lists are columns, so the stacked array is transposed
        object_values = np.array(
            [naive_col, eastern_col, cet_col], dtype=object
        ).T
        expected = DataFrame(
            object_values,
            index=timezone_frame.index,
            columns=timezone_frame.columns,
            dtype=object,
        )
        result = timezone_frame.astype(object)
        tm.assert_frame_equal(result, expected)

        with tm.assert_produces_warning(FutureWarning):
            # dt64tz->dt64 deprecated
            result = timezone_frame.astype("datetime64[ns]")

        def as_naive_utc(tz):
            # tz-aware range converted to UTC, then stripped of its timezone
            aware = date_range("20130101", periods=3, tz=tz)
            return aware.tz_convert("UTC").tz_localize(None)

        expected = DataFrame(
            {
                "A": date_range("20130101", periods=3),
                "B": as_naive_utc("US/Eastern"),
                "C": as_naive_utc("CET"),
            }
        )
        expected.iloc[1, 1] = NaT
        expected.iloc[1, 2] = NaT
        tm.assert_frame_equal(result, expected)

    def test_astype_dt64tz_to_str(self, timezone_frame):
        """Check ``astype(str)`` values and the frame's printed representation."""
        # str formatting
        first_row = [
            "2013-01-01",
            "2013-01-01 00:00:00-05:00",
            "2013-01-01 00:00:00+01:00",
        ]
        second_row = ["2013-01-02", "NaT", "NaT"]
        third_row = [
            "2013-01-03",
            "2013-01-03 00:00:00-05:00",
            "2013-01-03 00:00:00+01:00",
        ]
        result = timezone_frame.astype(str)
        expected = DataFrame(
            [first_row, second_row, third_row],
            columns=timezone_frame.columns,
        )
        tm.assert_frame_equal(result, expected)

        # each of these lines must appear in the printed frame
        printed_rows = (
            "0 2013-01-01 2013-01-01 00:00:00-05:00 2013-01-01 00:00:00+01:00",
            "1 2013-01-02                       NaT                       NaT",
            "2 2013-01-03 2013-01-03 00:00:00-05:00 2013-01-03 00:00:00+01:00",
        )
        with option_context("display.max_columns", 20):
            result = str(timezone_frame)
            for row in printed_rows:
                assert row in result

    def test_astype_empty_dtype_dict(self):
        """An empty mapping on an empty frame gives an equal but new object."""
        # issue mentioned further down in the following issue's thread
        # https://github.com/pandas-dev/pandas/issues/33113
        empty = DataFrame()
        casted = empty.astype({})
        tm.assert_frame_equal(casted, empty)
        assert casted is not empty

    @td.skip_array_manager_not_yet_implemented  # TODO(ArrayManager) ignore keyword
    @pytest.mark.parametrize(
        "df",
        [
            DataFrame(Series(["x", "y", "z"], dtype="string")),
            DataFrame(Series(["x", "y", "z"], dtype="category")),
            DataFrame(Series(3 * [Timestamp("2020-01-01", tz="UTC")])),
            DataFrame(Series(3 * [Interval(0, 1)])),
        ],
    )
    @pytest.mark.parametrize("errors", ["raise", "ignore"])
    def test_astype_ignores_errors_for_extension_dtypes(self, df, errors):
        """Cast extension-dtype frames to float under both ``errors`` modes."""
        # https://github.com/pandas-dev/pandas/issues/35471
        if errors != "ignore":
            # "raise": the cast must fail with a matching message
            msg = "(Cannot cast)|(could not convert)"
            with pytest.raises((ValueError, TypeError), match=msg):
                df.astype(float, errors=errors)
            return

        # "ignore": the result must equal the input frame
        expected = df
        result = df.astype(float, errors=errors)
        tm.assert_frame_equal(result, expected)

    def test_astype_tz_conversion(self):
        """Casting to another tz-aware dtype equals ``dt.tz_convert``."""
        # GH 35973
        london = date_range(
            "2020-08-30", freq="d", periods=2, tz="Europe/London"
        )
        df = DataFrame({"tz": london})
        result = df.astype({"tz": "datetime64[ns, Europe/Berlin]"})

        # the expectation is the source frame itself, converted in place
        # after the cast result has been computed
        df["tz"] = df["tz"].dt.tz_convert("Europe/Berlin")
        tm.assert_frame_equal(result, df)

    @pytest.mark.parametrize("tz", ["UTC", "Europe/Berlin"])
    def test_astype_tz_object_conversion(self, tz):
        """Round-trip tz-aware data through object dtype back to its own tz."""
        # GH 35973
        london = date_range(
            "2020-08-30", freq="d", periods=2, tz="Europe/London"
        )
        expected = DataFrame({"tz": london})

        # convert expected to object dtype from other tz str (independently tested)
        in_other_tz = expected.astype({"tz": f"datetime64[ns, {tz}]"})
        as_object = in_other_tz.astype({"tz": "object"})

        # do real test: object dtype to a specified tz, different from construction tz.
        result = as_object.astype({"tz": "datetime64[ns, Europe/London]"})
        tm.assert_frame_equal(result, expected)

    def test_astype_dt64_to_string(self, frame_or_series, tz_naive_fixture,
                                   request):
        """Cast datetime data to the ``"string"`` dtype in a Series or DataFrame."""
        tz = tz_naive_fixture
        if tz is None:
            # the tz-naive case is marked as an expected failure
            request.node.add_marker(
                pytest.mark.xfail(
                    reason="GH#36153 uses ndarray formatting instead of DTA formatting"
                )
            )

        dta = date_range("2016-01-01", periods=3, tz=tz)._data
        dta[0] = NaT

        obj = frame_or_series(dta)
        result = obj.astype("string")

        # Check that Series/DataFrame.astype matches DatetimeArray.astype
        expected = frame_or_series(dta.astype("string"))
        tm.assert_equal(result, expected)

        # the NaT slot must come out as pd.NA; a DataFrame needs a second
        # positional lookup to reach the scalar
        first = result.iloc[0]
        if frame_or_series is DataFrame:
            first = first.iloc[0]
        assert first is pd.NA

        # For non-NA values, we should match what we get for non-EA str
        plain_str = obj.astype(str)
        assert np.all(plain_str.iloc[1:] == result.iloc[1:])

    def test_astype_bytes(self):
        """Cast a frame of three-character strings to ``bytes``."""
        # GH#39474
        result = DataFrame(["foo", "bar", "baz"]).astype(bytes)
        # the only column is expected to have dtype "S3"
        assert result.dtypes[0] == np.dtype("S3")