Python CategoricalDtype примеры использования

Язык программирования: Python

Пространство имен/Пакет: pandas

Класс/Тип: CategoricalDtype

Примеров на hotexamples.com: 27

Python CategoricalDtype - 27 примеров найдено. Это лучшие примеры Python кода для pandas.CategoricalDtype, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

CategoricalDtype(30)

Основные методы

CategoricalDtype (30)

Пример #1

Показать файл

Файл: test_sort_index.py Проект: xl3024/pandas

    def test_sort_index_categorical_index(self):

        df = DataFrame({
            "A":
            np.arange(6, dtype="int64"),
            "B":
            Series(list("aabbca")).astype(CategoricalDtype(list("cab"))),
        }).set_index("B")

        result = df.sort_index()
        expected = df.iloc[[4, 0, 1, 5, 2, 3]]
        tm.assert_frame_equal(result, expected)

        result = df.sort_index(ascending=False)
        expected = df.iloc[[2, 3, 0, 1, 5, 4]]
        tm.assert_frame_equal(result, expected)

Пример #2

Показать файл

Файл: test_analytics.py Проект: tnir/pandas

    def test_unique_index_series(self, ordered):
        # GH38140
        dtype = CategoricalDtype([3, 2, 1], ordered=ordered)

        c = Categorical([3, 1, 2, 2, 1], dtype=dtype)
        # Categorical.unique sorts categories by appearance order
        # if ordered=False
        exp = Categorical([3, 1, 2], dtype=dtype)
        tm.assert_categorical_equal(c.unique(), exp)

        tm.assert_index_equal(Index(c).unique(), Index(exp))
        tm.assert_categorical_equal(Series(c).unique(), exp)

        c = Categorical([1, 1, 2, 2], dtype=dtype)
        exp = Categorical([1, 2], dtype=dtype)
        tm.assert_categorical_equal(c.unique(), exp)
        tm.assert_index_equal(Index(c).unique(), Index(exp))
        tm.assert_categorical_equal(Series(c).unique(), exp)

Пример #3

Показать файл

Файл: FeatureEncoding.py Проект: JamesMcGuigan/kaggle-house-prices

    def X_feature_label_encode(self, dataframe: DataFrame) -> DataFrame:
        for label_string, fieldnames in self.params[
                'X_feature_label_encode'].items():
            labels = label_string.split(',')
            category_dtype = CategoricalDtype(categories=labels, ordered=True)

            encoder = LabelEncoder()
            encoder.fit(labels)
            for fieldname in fieldnames:
                # Replace NaN with first label 'NA', encoder.transform() will throw exception on unseen values
                dataframe[fieldname] = dataframe[fieldname].astype(
                    category_dtype)
                dataframe[fieldname].fillna(labels[0], inplace=True)
                dataframe[f"{fieldname}_Numeric"] = encoder.transform(
                    dataframe[fieldname])

        self.params['X_feature_exclude'] += list(
            flatten(self.params['X_feature_label_encode'].values()))
        return dataframe

Пример #4

Показать файл

Файл: random_t.py Проект: SHUYUAN66/COGS-118A-Final

def clean_nsr(df):
    od = developer
    nsr_var = [
        'parents', 'has_nurs', 'form', 'children', 'housing', 'finance',
        'social', 'health', 'target'
    ]
    df.columns = nsr_var
    raw = df.copy()
    #raw = raw.replace({'inconv': 0, 'convenient': 1})
    df = df.replace('_', '', regex=True)
    #df = df.replace(' ', '', regex=True)
    df = df.drop(columns=['finance'])
    for i in df.columns:
        df[i] = df[i].astype('category')
        r = od[i]
        cat_r = CategoricalDtype(categories=r, ordered=True)
        # give the order
        df[i] = df[i].cat.reorder_categories(r, ordered=True)

    df['finance'] = raw['finance']
    return df

Пример #5

Показать файл

Файл: test_analytics.py Проект: tnir/pandas

    def test_unique(self, ordered):
        # GH38140
        dtype = CategoricalDtype(["a", "b", "c"], ordered=ordered)

        # categories are reordered based on value when ordered=False
        cat = Categorical(["a", "b", "c"], dtype=dtype)
        res = cat.unique()
        tm.assert_categorical_equal(res, cat)

        cat = Categorical(["a", "b", "a", "a"], dtype=dtype)
        res = cat.unique()
        tm.assert_categorical_equal(res, Categorical(["a", "b"], dtype=dtype))

        cat = Categorical(["c", "a", "b", "a", "a"], dtype=dtype)
        res = cat.unique()
        exp_cat = Categorical(["c", "a", "b"], dtype=dtype)
        tm.assert_categorical_equal(res, exp_cat)

        # nan must be removed
        cat = Categorical(["b", np.nan, "b", np.nan, "a"], dtype=dtype)
        res = cat.unique()
        exp_cat = Categorical(["b", np.nan, "a"], dtype=dtype)
        tm.assert_categorical_equal(res, exp_cat)

Пример #6

Показать файл

Файл: utils.py Проект: dmalt/prodroma

def load_dframe(sujb_num):
    fname = "prodroma.xlsx"
    subj_num = 0
    dframe = pd.read_excel(fname,
                           sheet_name=subj_num,
                           skiprows=10,
                           usecols="B:CR",
                           index_col=0).T

    dframe = dframe.rename(
        columns={
            "день": "day",
            "Время заполнения ТП": "fillin_time",
            "ГБ новая": "ha_new",
            "ГБ продолжение": "ha_cont",
            "Начало боли": "ha_start",
            "Окончание боли": "ha_stop",
            "Обезболивающее": "painkiller",
            "Название": "painkiller_name",
            "аура": "aura",
            "Боль сейчас": "ha_now",
            "ВАШ макс": "your_max",
            "односторонняя": "onesided",
            "пульсация": "pulsation",
            "усиление движением": "intens_by_mov",
            "тошнота": "vomiting",
            "чувствительность к свету": "light_sens_bin",
            "чувствительность к звуку": "noise_sens_bin",
            "чувствительность к запахам": "smell_sens_bin",
            "заметил провокатор": "noticed_trigger",
            "какой триггер": "which_trigger",
            "Продолжительность сна": "sleep_duration",
            "Качество сна": "sleep_quality",
            "Свежесть после сна": "sleep_freshness",
            "Больше света, чем обычно": "a_lot_light",
            "Чувствительность к свету": "light_sens_cat",
            "Больше звука чем обычно": "a_lot_noise",
            "Чувствительность к звуку": "noise_sens_cat",
            "Были резкие запахи?": "strong_smells",
            "Чувствительность к запахам": "smell_sens_cat",
            "Пропуск приема пищи": "meal_skip",
            "Чувство голода": "hunger",
            "Воды достаточно?": "hydration",
            "Жажда": "thirst",
            "Алкоголь": "alcohol",
            "кофеин": "caffeine",
            "сыр, шоко, цитрус": "cheese_choco_citrus",
            "Хотелось шоколада": "wanted_choco",
            "Чувство усталости": "tiredness",
            "Сложность концентрации": "focus_difficulty",
            "Тревога": "anxiety",
            "Депрессия": "depression",
            "Работоспособность": "productivity",
            "Работосособность": "productivity",
            "Сонливость": "sleepiness",
            "Зевания": "yawning",
            "Напряжение глаз": "eye_strain",
            "боль в шее": "neck_pain",
            "Чувствит кожи головы": "scalp_sens",
            "Физическая ативность": "exercise",
            "какой день": "which_day",
            "Перелеты": "flights",
            "1 день менструации": "pms_1st_day",
            "подташнивает": "nausea",
            "вегетатика": "vegetatics",
            "мочеиспускание": "urination",
            "% заполнения дневника": "journal_completion_percentage",
            "комментарий": "comment",
            "дата": "date",
            "ТП": "TP",
        })

    dframe = dframe.set_index(["date", "TP"])

    dframe.columns.rename(None, inplace=True)

    dframe.fillin_time = pd.to_datetime(dframe.fillin_time)

    dframe.replace(to_replace="да", value=True, inplace=True)
    dframe.replace(to_replace="нет", value=False, inplace=True)

    dframe["ha_new"] = dframe["ha_new"].fillna(False)
    dframe["ha_cont"] = dframe["ha_cont"].fillna(False)
    dframe["ha_now"] = dframe["ha_new"] | dframe["ha_cont"]

    dframe["painkiller"] = dframe["painkiller"].fillna(False)
    dframe["vomiting"] = dframe["vomiting"].fillna(False)
    dframe["intens_by_mov"] = dframe["intens_by_mov"].fillna(False)
    dframe["pulsation"] = dframe["pulsation"].fillna(False)
    dframe["light_sens_bin"] = dframe["light_sens_bin"].fillna(False)
    dframe["noise_sens_bin"] = dframe["noise_sens_bin"].fillna(False)
    dframe["smell_sens_bin"] = dframe["smell_sens_bin"].fillna(False)
    dframe["flights"] = dframe["flights"].fillna(False)
    dframe["pms_1st_day"] = dframe["pms_1st_day"].fillna(False)

    cat_type = CategoricalDtype([1, 2, 3, 4, 5], ordered=True)
    for col in [
            "anxiety",
            "depression",
            "tiredness",
            "productivity",
            "sleepiness",
            "light_sens_cat",
            "smell_sens_cat",
            "noise_sens_cat",
            "sleep_quality",
            "sleep_freshness",
            "hunger",
    ]:
        dframe[col] = dframe[col].astype(cat_type)
    # dframe["anxiety"] = dframe["depression"].astype(int).astype('category')
    return dframe

Пример #7

Показать файл

    def test_unique(self, data, categories, expected_data, ordered):
        dtype = CategoricalDtype(categories, ordered=ordered)

        idx = CategoricalIndex(data, dtype=dtype)
        expected = CategoricalIndex(expected_data, dtype=dtype)
        tm.assert_index_equal(idx.unique(), expected)

Пример #8

Показать файл

                  columns=list('ABCDEF'))
initMetadata(df)
MARKERS = ['hex', 'circle_x', 'triangle', 'square']
markerFactor = factor_mark('DDC', MARKERS, ["A0", "A1", "A2", "A3", "A4"])
colorFactor = factor_cmap('DDC', 'Category10_6',
                          ["A0", "A1", "A2", "A3", "A4"])

mapDDC = {0: "A0", 1: "A1", 2: "A2", 3: "A3", 4: "A4"}
df.eval("Bool=A>0.5", inplace=True)
df.eval("BoolB=B>0.5", inplace=True)
df.eval("BoolC=C>0.1", inplace=True)
df["A"] = df["A"].round(3)
df["B"] = df["B"].round(3)
df["C"] = df["C"].round(3)
df["D"] = df["D"].round(3)
df["AA"] = ((df.A * 10).round(0)).astype(CategoricalDtype(ordered=True))
df["CC"] = ((df.C * 5).round(0)).astype(int)
df["DD"] = ((df.D * 4).round(0)).astype(int)
df["DDC"] = ((df.D * 4).round(0)).astype(int).map(mapDDC)
df["EE"] = (df.E * 4).round(0)
df['errY'] = df.A * 0.02 + 0.02
df.head(10)
df.meta.metaData = {
    'A.AxisTitle': "A (cm)",
    'B.AxisTitle': "B (cm/s)",
    'C.AxisTitle': "C (s)",
    'D.AxisTitle': "D (a.u.)",
    'Bool.AxisTitle': "A>half",
    'E.AxisTitle': "Category"
}

Пример #9

Показать файл

Файл: aliTreePlayer.py Проект: janthamade/RootInteractive

def tree2Panda(tree, include, selection, **kwargs):
    r"""
    Convert selected items from the tree into panda table
    TODO:
        * to  consult with uproot
            * currently not able to work with friend trees
        * check the latest version of RDeatFrame (in AliRoot latest v16.16.00)
        * Add filter on metadata - e.g class of variables
    :param tree:            input tree
    :param include:         regular expresion array - processing Tree+Friends, branches, aliases
    :param selection:       tree selection ()
    :param kwargs:
        * exclude           exclude arrray
        * firstEntry        firt entry to enter
        * nEntries          number of entries to convert
        * column mask
    :return:                panda data frame
    """
    options = {
        "exclude": [],
        "firstEntry": 0,
        "nEntries": 100000000,
        "columnMask": [[".fX$", "_X"], [".fY$", "_y"], [".fElements", ""]],
        "category":0,
        "verbose": 0
    }
    options.update(kwargs)
    if not hasattr(tree, 'anyTree'):
        treeToAnyTree(tree)          # expand tree/aliases/variables - if not done before
    anyTree = tree.anyTree
    # check regular expressions in anyTree
    variablesTree = findSelectedBranches(anyTree, include, options["exclude"])
    variables = ""

    for var in variablesTree:
        # if var.length<2: continue
        var = var.replace("/", ".")
        variables += var + ":"
    # check if valid TTree formula
    for var in include:
        if ".*" in var:
            continue
        formula=    ROOT.TTreeFormula('test', var, tree)
        if (formula.GetNdim()>0):
            variables += var + ":"
    variables = variables[0:-1]

    entries = tree.Draw(str(variables), selection, "goffpara", options["nEntries"], options["firstEntry"])  # query data
    columns = variables.split(":")
    for i, column in enumerate(columns):
        columns[i] = column.replace(".", "_")
    # replace column names
    #    1.) pandas does not allow dots in names
    #    2.) user can specified own column mask
    for i, column in enumerate(columns):
        for mask in options["columnMask"]:
            columns[i] = columns[i].replace(mask[0], mask[1])

    ex_dict = {}
    for i, a in enumerate(columns):
        val = tree.GetVal(i)
        ex_dict[a] = np.frombuffer(val, dtype=float, count=entries)
    df = pd.DataFrame(ex_dict, columns=columns)
    for i, a in enumerate(columns):
        if (tree.GetLeaf(a)):
              if (tree.GetLeaf(a).ClassName() == 'TLeafC'): df[a]=df[a].astype(np.int8)
              if (tree.GetLeaf(a).ClassName() == 'TLeafS'): df[a]=df[a].astype(np.int16)
              if (tree.GetLeaf(a).ClassName() == 'TLeafI'): df[a]=df[a].astype(np.int32)
              if (tree.GetLeaf(a).ClassName() == 'TLeafL'): df[a]=df[a].astype(np.int64)
              if (tree.GetLeaf(a).ClassName() == 'TLeafB'): df[a] = df[a].astype(bool)
        if (options["category"]>0):
            dfUniq=df[a].unique()
            if dfUniq.shape[0]<=options["category"] :
                df[a]=df[a].astype(CategoricalDtype(ordered=True))



    initMetadata(df)
    metaData = tree.GetUserInfo().FindObject("metaTable")
    if metaData:
        for key in metaData:
            df.meta.metaData[key.GetName()] = key.GetTitle()
    return df

Пример #10

Показать файл

Файл: test_getitem.py Проект: tnir/pandas

    def test_getitem_bool_mask_categorical_index(self):

        df3 = DataFrame(
            {
                "A": np.arange(6, dtype="int64"),
            },
            index=CategoricalIndex(
                [1, 1, 2, 1, 3, 2],
                dtype=CategoricalDtype([3, 2, 1], ordered=True),
                name="B",
            ),
        )
        df4 = DataFrame(
            {
                "A": np.arange(6, dtype="int64"),
            },
            index=CategoricalIndex(
                [1, 1, 2, 1, 3, 2],
                dtype=CategoricalDtype([3, 2, 1], ordered=False),
                name="B",
            ),
        )

        result = df3[df3.index == "a"]
        expected = df3.iloc[[]]
        tm.assert_frame_equal(result, expected)

        result = df4[df4.index == "a"]
        expected = df4.iloc[[]]
        tm.assert_frame_equal(result, expected)

        result = df3[df3.index == 1]
        expected = df3.iloc[[0, 1, 3]]
        tm.assert_frame_equal(result, expected)

        result = df4[df4.index == 1]
        expected = df4.iloc[[0, 1, 3]]
        tm.assert_frame_equal(result, expected)

        # since we have an ordered categorical

        # CategoricalIndex([1, 1, 2, 1, 3, 2],
        #         categories=[3, 2, 1],
        #         ordered=True,
        #         name='B')
        result = df3[df3.index < 2]
        expected = df3.iloc[[4]]
        tm.assert_frame_equal(result, expected)

        result = df3[df3.index > 1]
        expected = df3.iloc[[]]
        tm.assert_frame_equal(result, expected)

        # unordered
        # cannot be compared

        # CategoricalIndex([1, 1, 2, 1, 3, 2],
        #         categories=[3, 2, 1],
        #         ordered=False,
        #         name='B')
        msg = "Unordered Categoricals can only compare equality or not"
        with pytest.raises(TypeError, match=msg):
            df4[df4.index < 2]
        with pytest.raises(TypeError, match=msg):
            df4[df4.index > 1]

Пример #11

Показать файл

Файл: extra.py Проект: pydemic/mundi

from . import loader

REGIONS = {
    "north-america": "North America",
    "south-asia": "South Asia",
    "sub-saharan-africa": "Sub-Saharan Africa",
    "europe": "Europe & Central Asia",
    "latin-america": "Latin America & Caribbean",
    "middle-east": "Middle East & North Africa",
    "east-asia": "East Asia & Pacific",
}
INCOME_GROUPS = {
    "low": "Low income",
    "lower-middle": "Lower middle income",
    "upper-middle": "Upper middle income",
    "high": "High income",
}

IncomeGroup = CategoricalDtype(categories=INCOME_GROUPS, ordered=True)
Region = CategoricalDtype(categories=REGIONS, ordered=False)


@loader.filtering_from_data(["region"])
def load_region():
    return loader.load_database("un.pkl.gz").astype(Region)


@loader.filtering_from_data(["income_group"])
def load_income_group():
    return loader.load_database("un.pkl.gz").astype(IncomeGroup)

Пример #12

Показать файл

Файл: test_update.py Проект: zoehuang7/pandas

class TestUpdate:
    def test_update(self):
        s = Series([1.5, np.nan, 3.0, 4.0, np.nan])
        s2 = Series([np.nan, 3.5, np.nan, 5.0])
        s.update(s2)

        expected = Series([1.5, 3.5, 3.0, 5.0, np.nan])
        tm.assert_series_equal(s, expected)

        # GH 3217
        df = DataFrame([{"a": 1}, {"a": 3, "b": 2}])
        df["c"] = np.nan

        df["c"].update(Series(["foo"], index=[0]))
        expected = DataFrame(
            [[1, np.nan, "foo"], [3, 2.0, np.nan]], columns=["a", "b", "c"]
        )
        tm.assert_frame_equal(df, expected)

    @pytest.mark.parametrize(
        "other, dtype, expected",
        [
            # other is int
            ([61, 63], "int32", Series([10, 61, 12], dtype="int32")),
            ([61, 63], "int64", Series([10, 61, 12])),
            ([61, 63], float, Series([10.0, 61.0, 12.0])),
            ([61, 63], object, Series([10, 61, 12], dtype=object)),
            # other is float, but can be cast to int
            ([61.0, 63.0], "int32", Series([10, 61, 12], dtype="int32")),
            ([61.0, 63.0], "int64", Series([10, 61, 12])),
            ([61.0, 63.0], float, Series([10.0, 61.0, 12.0])),
            ([61.0, 63.0], object, Series([10, 61.0, 12], dtype=object)),
            # others is float, cannot be cast to int
            ([61.1, 63.1], "int32", Series([10.0, 61.1, 12.0])),
            ([61.1, 63.1], "int64", Series([10.0, 61.1, 12.0])),
            ([61.1, 63.1], float, Series([10.0, 61.1, 12.0])),
            ([61.1, 63.1], object, Series([10, 61.1, 12], dtype=object)),
            # other is object, cannot be cast
            ([(61,), (63,)], "int32", Series([10, (61,), 12])),
            ([(61,), (63,)], "int64", Series([10, (61,), 12])),
            ([(61,), (63,)], float, Series([10.0, (61,), 12.0])),
            ([(61,), (63,)], object, Series([10, (61,), 12])),
        ],
    )
    def test_update_dtypes(self, other, dtype, expected):

        ser = Series([10, 11, 12], dtype=dtype)
        other = Series(other, index=[1, 3])
        ser.update(other)

        tm.assert_series_equal(ser, expected)

    @pytest.mark.parametrize(
        "series, other, expected",
        [
            # update by key
            (
                Series({"a": 1, "b": 2, "c": 3, "d": 4}),
                {"b": 5, "c": np.nan},
                Series({"a": 1, "b": 5, "c": 3, "d": 4}),
            ),
            # update by position
            (Series([1, 2, 3, 4]), [np.nan, 5, 1], Series([1, 5, 1, 4])),
        ],
    )
    def test_update_from_non_series(self, series, other, expected):
        # GH 33215
        series.update(other)
        tm.assert_series_equal(series, expected)

    @pytest.mark.parametrize(
        "data, other, expected, dtype",
        [
            (["a", None], [None, "b"], ["a", "b"], "string"),
            pytest.param(
                ["a", None],
                [None, "b"],
                ["a", "b"],
                "arrow_string",
                marks=td.skip_if_no("pyarrow", min_version="1.0.0"),
            ),
            ([1, None], [None, 2], [1, 2], "Int64"),
            ([True, None], [None, False], [True, False], "boolean"),
            (
                ["a", None],
                [None, "b"],
                ["a", "b"],
                CategoricalDtype(categories=["a", "b"]),
            ),
            (
                [Timestamp(year=2020, month=1, day=1, tz="Europe/London"), NaT],
                [NaT, Timestamp(year=2020, month=1, day=1, tz="Europe/London")],
                [Timestamp(year=2020, month=1, day=1, tz="Europe/London")] * 2,
                "datetime64[ns, Europe/London]",
            ),
        ],
    )
    def test_update_extension_array_series(self, data, other, expected, dtype):
        result = Series(data, dtype=dtype)
        other = Series(other, dtype=dtype)
        expected = Series(expected, dtype=dtype)

        result.update(other)
        tm.assert_series_equal(result, expected)

    def test_update_with_categorical_type(self):
        # GH 25744
        dtype = CategoricalDtype(["a", "b", "c", "d"])
        s1 = Series(["a", "b", "c"], index=[1, 2, 3], dtype=dtype)
        s2 = Series(["b", "a"], index=[1, 2], dtype=dtype)
        s1.update(s2)
        result = s1
        expected = Series(["b", "a", "c"], index=[1, 2, 3], dtype=dtype)
        tm.assert_series_equal(result, expected)

Пример #13

Показать файл

Файл: wiki_ampligraph.py Проект: c0ntradicti0n/KnowledgeScience

    if kind == "kn":
        subkind = "tsne"
    else:
        sub_kind = kind

    subset = cl_df[[c + "_" + sub_kind for c in ['x', 'y', 'z']]]
    print(subset[:10])

    points = [list(x) for x in subset.to_numpy()]
    print(points[:10])
    print(len(points))

    arr = np.array(points)
    dist = Y = cdist(arr, arr, 'euclidean')
    new_path = make_path(np.array(points), dist)[:-1]
    print(new_path)

    cl_df[['cl_%s' % k for k in things]] = cl_cols

    path_order_categories = CategoricalDtype(categories=new_path, ordered=True)
    cl_df['cl_%s' % kind] = cl_df['cl'].astype(path_order_categories)

    cl_df.sort_values(['cl_%s' % kind], inplace=True)
    cl_df['cl_%s' % kind] = cl_df['cl'].astype('int32')

    cl_df.to_csv('%s_clusters_mean_points.csv' % kind,
                 sep='\t',
                 header=True,
                 index=False)
    print(kind + " " + str(new_path))

Пример #14

Показать файл

Файл: test_transpose.py Проект: Aathi410/Pro123


def test_numpy_transpose(index_or_series_obj):
    msg = "the 'axes' parameter is not supported"
    obj = index_or_series_obj
    tm.assert_equal(np.transpose(obj), obj)

    with pytest.raises(ValueError, match=msg):
        np.transpose(obj, axes=1)


@pytest.mark.parametrize(
    "data, transposed_data, index, columns, dtype",
    [
        ([[1], [2]], [[1, 2]], ["a", "a"], ["b"], int),
        ([[1], [2]], [[1, 2]], ["a", "a"], ["b"], CategoricalDtype([1, 2])),
        ([[1, 2]], [[1], [2]], ["b"], ["a", "a"], int),
        ([[1, 2]], [[1], [2]], ["b"], ["a", "a"], CategoricalDtype([1, 2])),
        ([[1, 2], [3, 4]], [[1, 3], [2, 4]], ["a", "a"], ["b", "b"], int),
        (
            [[1, 2], [3, 4]],
            [[1, 3], [2, 4]],
            ["a", "a"],
            ["b", "b"],
            CategoricalDtype([1, 2, 3, 4]),
        ),
    ],
)
def test_duplicate_labels(data, transposed_data, index, columns, dtype):
    # GH 42380
    df = DataFrame(data, index=index, columns=columns, dtype=dtype)

Пример #15

Показать файл

Файл: test_london_scraper_output.py Проект: michaelwalshe/scrape_london_marathon

def test_output_attributes(scraper_output):
    results = scraper_output
    exp_cols = [
        "Place (Overall)",
        "Place (Gender)",
        "Place (Category)",
        "Name",
        "Sex",
        "Club",
        "Running Number",
        "Category",
        "Finish",
        "Year",
        "Country",
        "FirstName",
        "LastName",
        "DSQ",
        "Finish (Total Seconds)",
    ]

    exp_dtypes = pd.Series({
        "Place (Overall)":
        Int64Dtype(),
        "Place (Gender)":
        Int64Dtype(),
        "Place (Category)":
        dtype("float64"),
        "Name":
        dtype("O"),
        "Sex":
        dtype("O"),
        "Club":
        dtype("O"),
        "Running Number":
        dtype("O"),
        "Category":
        CategoricalDtype(
            categories=[
                "18-39",
                "40-44",
                "45-49",
                "50-54",
                "55-59",
                "60-64",
                "65-69",
                "70+",
                "70-74",
                "75-79",
                "80-84",
                "85+",
                "80+",
                "Unknown",
            ],
            ordered=False,
        ),
        "Finish":
        dtype("<m8[ns]"),
        "Year":
        Int64Dtype(),
        "Country":
        dtype("O"),
        "FirstName":
        dtype("O"),
        "LastName":
        dtype("O"),
        "DSQ":
        dtype("bool"),
        "Finish (Total Seconds)":
        dtype("float64"),
    })

    exp_rows_min = 1000  # One sex for one year should give at least this many

    assert exp_cols == list(results.columns), "Expected columns not found"
    assert exp_rows_min <= results.shape[
        0], "Less than minimum expected number of rows"

    assert exp_dtypes.values.tolist() == results.dtypes.values.tolist()

Пример #16

Показать файл

import numpy as np
import pandas as pd
from pandas import CategoricalDtype

df = pd.read_csv('D:\\Study\\ML\\Final_Project\\dataset-har-PUC-Rio-ugulino\\Full_Data.csv', delimiter=';')
df['how_tall_in_meters'] = df['how_tall_in_meters'].apply(lambda x: int(x.replace(',', '')))
df['body_mass_index'] = df['body_mass_index'].apply(lambda x: float(x.replace(',', '.')))

df["user"] = df["user"].astype(CategoricalDtype(['debora', 'katia', 'wallace', 'jose_carlos']))
df = pd.concat([df, pd.get_dummies(df['user'], prefix='user')], axis=1)

df["gender"] = df["gender"].astype(CategoricalDtype(['Woman', 'Man']))
df = pd.concat([df, pd.get_dummies(df['gender'], prefix='gender')], axis=1)

df["class"] = df["class"].astype(CategoricalDtype(['sitting', 'sittingdown', 'standing', 'standingup', 'walking']))
df = pd.concat([df, pd.get_dummies(df['class'], prefix='class')], axis=1)

df.drop(['user'], axis=1, inplace=True)
df.drop(['gender'], axis=1, inplace=True)
df.drop(['class'], axis=1, inplace=True)

array = df.to_numpy()
np.random.shuffle(array)
train_data = array[:int(len(array) * 0.8)]
test_data = array[int(len(array) * 0.8):]
pd.DataFrame(train_data).to_csv("D:\\Study\\ML\\Final_Project\\Sources\\Datasets\\Train_data.csv", header=df.columns,
                                index=False)
pd.DataFrame(test_data).to_csv("D:\\Study\\ML\\Final_Project\\Sources\\Datasets\\Test_data.csv", header=df.columns,
                               index=False)

Пример #17

Показать файл

Файл: create_data.py Проект: marcdumon/caribbean_challenge

def preprocess_features(fp_processed, only_label=True):
    # Load and merge the datasets
    train = pd.read_csv(fp_processed + 'train.csv', index_col=0)
    valid = pd.read_csv(fp_processed + 'valid.csv', index_col=0)
    test = pd.read_csv(fp_processed + 'test.csv', index_col=0)
    #   For easier splitting afterwards
    train['dataset'] = 'train'
    valid['dataset'] = 'valid'
    test['dataset'] = 'test'
    tvt = pd.concat([train, valid, test])

    labels = [
        'concrete_cement', 'healthy_metal', 'incomplete', 'irregular_metal',
        'other'
    ]
    countries = ['colombia', 'guatemala', 'st_lucia']
    places = [
        'borde_rural', 'borde_soacha', 'castries', 'dennery', 'gros_islet',
        'mixco_1_and_ebenezer', 'mixco_3'
    ]

    countries_cat_type = CategoricalDtype(categories=countries, ordered=True)
    places_cat_type = CategoricalDtype(categories=places, ordered=True)
    labels_cat_type = CategoricalDtype(
        categories=labels + ['unknown'],
        ordered=True)  # +['unknown] for the nan's in neighbour labels

    # Encode labels
    tvt.loc[:, 'label'] = tvt.loc[:, 'label'].astype(labels_cat_type).cat.codes

    if not only_label:
        # Encode categories
        #   First handle nan, otherwise cat.code for nan is -1, resulting in error in ebedding (index out of range: -1)
        tvt = tvt.fillna('unknown')

        tvt.loc[:, 'country'] = tvt.loc[:, 'country'].astype(str).astype(
            countries_cat_type).cat.codes
        tvt.loc[:,
                'place'] = tvt.loc[:,
                                   'place'].astype(places_cat_type).cat.codes
        tvt.loc[:, 'verified'] = tvt.loc[:, 'verified'].astype(int)
        for i in range(1, 21):
            tvt.loc[:, f'l_{i}'] = tvt.loc[:, f'l_{i}'].astype(
                labels_cat_type).cat.codes

        # Normalize continuous features
        continuous_cols = [
            'area', 'complexity', 'z_min', 'z_max', 'z_median', 'z_count',
            'z_majority', 'z_minority', 'z_unique', 'z_range', 'z_sum'
        ]
        for col in continuous_cols:
            mu = tvt[col].mean()
            sigma = tvt[col].std()
            tvt.loc[:, col] = (tvt[col] - mu) / sigma

        # Normalize distances
        mu = tvt.loc[:, 'd_1':'d_19'].values.mean()
        sigma = tvt.loc[:, 'd_1':'d_19'].values.std()
        for i in range(1, 21):
            tvt.loc[:, f'd_{i}'] = (tvt[f'd_{i}'] - mu) / sigma

    # split and save
    train = tvt[tvt['dataset'] == 'train']
    valid = tvt[tvt['dataset'] == 'valid']
    test = tvt[tvt['dataset'] == 'test']
    train.to_csv(fp_processed + 'train_.csv')
    valid.to_csv(fp_processed + 'valid.csv')
    test.to_csv(fp_processed + 'test.csv')

Пример #18

Показать файл

Файл: test_io.py Проект: vishalbelsare/hpat

 def test_impl():
     names = ['C1', 'C2', 'C3']
     ct_dtype = CategoricalDtype(['A', 'B', 'C'])
     dtypes = {'C1': np.int, 'C2': ct_dtype, 'C3': str}
     df = pd.read_csv("csv_data_cat1.csv", names=names, dtype=dtypes)
     return df

Пример #19

Показать файл

class TestDataFrameToRecords:
    def test_to_records_dt64(self):
        df = DataFrame(
            [["one", "two", "three"], ["four", "five", "six"]],
            index=date_range("2012-01-01", "2012-01-02"),
        )

        expected = df.index.values[0]
        result = df.to_records()["index"][0]
        assert expected == result

    def test_to_records_dt64tz_column(self):
        # GH#32535 dont less tz in to_records
        df = DataFrame(
            {"A": date_range("2012-01-01", "2012-01-02", tz="US/Eastern")})

        result = df.to_records()

        assert result.dtype["A"] == object
        val = result[0][1]
        assert isinstance(val, Timestamp)
        assert val == df.loc[0, "A"]

    def test_to_records_with_multindex(self):
        # GH#3189
        index = [
            ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
            ["one", "two", "one", "two", "one", "two", "one", "two"],
        ]
        data = np.zeros((8, 4))
        df = DataFrame(data, index=index)
        r = df.to_records(index=True)["level_0"]
        assert "bar" in r
        assert "one" not in r

    def test_to_records_with_Mapping_type(self):
        import email
        from email.parser import Parser

        abc.Mapping.register(email.message.Message)

        headers = Parser().parsestr("From: <*****@*****.**>\n"
                                    "To: <*****@*****.**>\n"
                                    "Subject: Test message\n"
                                    "\n"
                                    "Body would go here\n")

        frame = DataFrame.from_records([headers])
        all(x in frame for x in ["Type", "Subject", "From"])

    def test_to_records_floats(self):
        df = DataFrame(np.random.rand(10, 10))
        df.to_records()

    def test_to_records_index_name(self):
        df = DataFrame(np.random.randn(3, 3))
        df.index.name = "X"
        rs = df.to_records()
        assert "X" in rs.dtype.fields

        df = DataFrame(np.random.randn(3, 3))
        rs = df.to_records()
        assert "index" in rs.dtype.fields

        df.index = MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")])
        df.index.names = ["A", None]
        rs = df.to_records()
        assert "level_0" in rs.dtype.fields

    def test_to_records_with_unicode_index(self):
        # GH#13172
        # unicode_literals conflict with to_records
        result = DataFrame([{"a": "x", "b": "y"}]).set_index("a").to_records()
        expected = np.rec.array([("x", "y")], dtype=[("a", "O"), ("b", "O")])
        tm.assert_almost_equal(result, expected)

    def test_to_records_with_unicode_column_names(self):
        # xref issue: https://github.com/numpy/numpy/issues/2407
        # Issue GH#11879. to_records used to raise an exception when used
        # with column names containing non-ascii characters in Python 2
        result = DataFrame(data={"accented_name_é": [1.0]}).to_records()

        # Note that numpy allows for unicode field names but dtypes need
        # to be specified using dictionary instead of list of tuples.
        expected = np.rec.array(
            [(0, 1.0)],
            dtype={
                "names": ["index", "accented_name_é"],
                "formats": ["=i8", "=f8"]
            },
        )
        tm.assert_almost_equal(result, expected)

    def test_to_records_with_categorical(self):
        # GH#8626

        # dict creation
        df = DataFrame({"A": list("abc")}, dtype="category")
        expected = Series(list("abc"), dtype="category", name="A")
        tm.assert_series_equal(df["A"], expected)

        # list-like creation
        df = DataFrame(list("abc"), dtype="category")
        expected = Series(list("abc"), dtype="category", name=0)
        tm.assert_series_equal(df[0], expected)

        # to record array
        # this coerces
        result = df.to_records()
        expected = np.rec.array([(0, "a"), (1, "b"), (2, "c")],
                                dtype=[("index", "=i8"), ("0", "O")])
        tm.assert_almost_equal(result, expected)

    @pytest.mark.parametrize(
        "kwargs,expected",
        [
            # No dtypes --> default to array dtypes.
            (
                dict(),
                np.rec.array(
                    [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                    dtype=[("index", "<i8"), ("A", "<i8"), ("B", "<f8"),
                           ("C", "O")],
                ),
            ),
            # Should have no effect in this case.
            (
                dict(index=True),
                np.rec.array(
                    [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                    dtype=[("index", "<i8"), ("A", "<i8"), ("B", "<f8"),
                           ("C", "O")],
                ),
            ),
            # Column dtype applied across the board. Index unaffected.
            (
                dict(column_dtypes="<U4"),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<i8"), ("A", "<U4"), ("B", "<U4"),
                           ("C", "<U4")],
                ),
            ),
            # Index dtype applied across the board. Columns unaffected.
            (
                dict(index_dtypes="<U1"),
                np.rec.array(
                    [("0", 1, 0.2, "a"), ("1", 2, 1.5, "bc")],
                    dtype=[("index", "<U1"), ("A", "<i8"), ("B", "<f8"),
                           ("C", "O")],
                ),
            ),
            # Pass in a type instance.
            (
                dict(column_dtypes=np.unicode),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<i8"), ("A", "<U"), ("B", "<U"),
                           ("C", "<U")],
                ),
            ),
            # Pass in a dtype instance.
            (
                dict(column_dtypes=np.dtype("unicode")),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<i8"), ("A", "<U"), ("B", "<U"),
                           ("C", "<U")],
                ),
            ),
            # Pass in a dictionary (name-only).
            (
                dict(column_dtypes={
                    "A": np.int8,
                    "B": np.float32,
                    "C": "<U2"
                }),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<i8"), ("A", "i1"), ("B", "<f4"),
                           ("C", "<U2")],
                ),
            ),
            # Pass in a dictionary (indices-only).
            (
                dict(index_dtypes={0: "int16"}),
                np.rec.array(
                    [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                    dtype=[("index", "i2"), ("A", "<i8"), ("B", "<f8"),
                           ("C", "O")],
                ),
            ),
            # Ignore index mappings if index is not True.
            (
                dict(index=False, index_dtypes="<U2"),
                np.rec.array(
                    [(1, 0.2, "a"), (2, 1.5, "bc")],
                    dtype=[("A", "<i8"), ("B", "<f8"), ("C", "O")],
                ),
            ),
            # Non-existent names / indices in mapping should not error.
            (
                dict(index_dtypes={
                    0: "int16",
                    "not-there": "float32"
                }),
                np.rec.array(
                    [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                    dtype=[("index", "i2"), ("A", "<i8"), ("B", "<f8"),
                           ("C", "O")],
                ),
            ),
            # Names / indices not in mapping default to array dtype.
            (
                dict(column_dtypes={
                    "A": np.int8,
                    "B": np.float32
                }),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<i8"), ("A", "i1"), ("B", "<f4"),
                           ("C", "O")],
                ),
            ),
            # Names / indices not in dtype mapping default to array dtype.
            (
                dict(column_dtypes={
                    "A": np.dtype("int8"),
                    "B": np.dtype("float32")
                }),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<i8"), ("A", "i1"), ("B", "<f4"),
                           ("C", "O")],
                ),
            ),
            # Mixture of everything.
            (
                dict(column_dtypes={
                    "A": np.int8,
                    "B": np.float32
                },
                     index_dtypes="<U2"),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<U2"), ("A", "i1"), ("B", "<f4"),
                           ("C", "O")],
                ),
            ),
            # Invalid dype values.
            (
                dict(index=False, column_dtypes=list()),
                (ValueError, "Invalid dtype \\[\\] specified for column A"),
            ),
            (
                dict(index=False, column_dtypes={
                    "A": "int32",
                    "B": 5
                }),
                (ValueError, "Invalid dtype 5 specified for column B"),
            ),
            # Numpy can't handle EA types, so check error is raised
            (
                dict(
                    index=False,
                    column_dtypes={
                        "A": "int32",
                        "B": CategoricalDtype(["a", "b"])
                    },
                ),
                (ValueError, "Invalid dtype category specified for column B"),
            ),
            # Check that bad types raise
            (
                dict(index=False, column_dtypes={
                    "A": "int32",
                    "B": "foo"
                }),
                (TypeError, "data type [\"']foo[\"'] not understood"),
            ),
        ],
    )
    def test_to_records_dtype(self, kwargs, expected):
        # see GH#18146
        df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]})

        if not isinstance(expected, np.recarray):
            with pytest.raises(expected[0], match=expected[1]):
                df.to_records(**kwargs)
        else:
            result = df.to_records(**kwargs)
            tm.assert_almost_equal(result, expected)

    @pytest.mark.parametrize(
        "df,kwargs,expected",
        [
            # MultiIndex in the index.
            (
                DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                          columns=list("abc")).set_index(["a", "b"]),
                dict(column_dtypes="float64",
                     index_dtypes={
                         0: "int32",
                         1: "int8"
                     }),
                np.rec.array(
                    [(1, 2, 3.0), (4, 5, 6.0), (7, 8, 9.0)],
                    dtype=[("a", "<i4"), ("b", "i1"), ("c", "<f8")],
                ),
            ),
            # MultiIndex in the columns.
            (
                DataFrame(
                    [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                    columns=MultiIndex.from_tuples([("a", "d"), ("b", "e"),
                                                    ("c", "f")]),
                ),
                dict(column_dtypes={
                    0: "<U1",
                    2: "float32"
                },
                     index_dtypes="float32"),
                np.rec.array(
                    [(0.0, "1", 2, 3.0), (1.0, "4", 5, 6.0),
                     (2.0, "7", 8, 9.0)],
                    dtype=[
                        ("index", "<f4"),
                        ("('a', 'd')", "<U1"),
                        ("('b', 'e')", "<i8"),
                        ("('c', 'f')", "<f4"),
                    ],
                ),
            ),
            # MultiIndex in both the columns and index.
            (
                DataFrame(
                    [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                    columns=MultiIndex.from_tuples([("a", "d"), ("b", "e"),
                                                    ("c", "f")],
                                                   names=list("ab")),
                    index=MultiIndex.from_tuples([("d", -4), ("d", -5),
                                                  ("f", -6)],
                                                 names=list("cd")),
                ),
                dict(column_dtypes="float64",
                     index_dtypes={
                         0: "<U2",
                         1: "int8"
                     }),
                np.rec.array(
                    [
                        ("d", -4, 1.0, 2.0, 3.0),
                        ("d", -5, 4.0, 5.0, 6.0),
                        ("f", -6, 7, 8, 9.0),
                    ],
                    dtype=[
                        ("c", "<U2"),
                        ("d", "i1"),
                        ("('a', 'd')", "<f8"),
                        ("('b', 'e')", "<f8"),
                        ("('c', 'f')", "<f8"),
                    ],
                ),
            ),
        ],
    )
    def test_to_records_dtype_mi(self, df, kwargs, expected):
        # see GH#18146
        result = df.to_records(**kwargs)
        tm.assert_almost_equal(result, expected)

    def test_to_records_dict_like(self):
        # see GH#18146
        class DictLike:
            def __init__(self, **kwargs):
                self.d = kwargs.copy()

            def __getitem__(self, key):
                return self.d.__getitem__(key)

            def __contains__(self, key) -> bool:
                return key in self.d

            def keys(self):
                return self.d.keys()

        df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]})

        dtype_mappings = dict(
            column_dtypes=DictLike(**{
                "A": np.int8,
                "B": np.float32
            }),
            index_dtypes="<U2",
        )

        result = df.to_records(**dtype_mappings)
        expected = np.rec.array(
            [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
            dtype=[("index", "<U2"), ("A", "i1"), ("B", "<f4"), ("C", "O")],
        )
        tm.assert_almost_equal(result, expected)

    @pytest.mark.parametrize("tz", ["UTC", "GMT", "US/Eastern"])
    def test_to_records_datetimeindex_with_tz(self, tz):
        # GH#13937
        dr = date_range("2016-01-01", periods=10, freq="S", tz=tz)

        df = DataFrame({"datetime": dr}, index=dr)

        expected = df.to_records()
        result = df.tz_convert("UTC").to_records()

        # both converted to UTC, so they are equal
        tm.assert_numpy_array_equal(result, expected)

Пример #20

Показать файл

class TestDataFrameConvertTo(TestData):

    def test_to_dict_timestamp(self):

        # GH11247
        # split/records producing np.datetime64 rather than Timestamps
        # on datetime64[ns] dtypes only

        tsmp = Timestamp('20130101')
        test_data = DataFrame({'A': [tsmp, tsmp], 'B': [tsmp, tsmp]})
        test_data_mixed = DataFrame({'A': [tsmp, tsmp], 'B': [1, 2]})

        expected_records = [{'A': tsmp, 'B': tsmp},
                            {'A': tsmp, 'B': tsmp}]
        expected_records_mixed = [{'A': tsmp, 'B': 1},
                                  {'A': tsmp, 'B': 2}]

        assert (test_data.to_dict(orient='records') ==
                expected_records)
        assert (test_data_mixed.to_dict(orient='records') ==
                expected_records_mixed)

        expected_series = {
            'A': Series([tsmp, tsmp], name='A'),
            'B': Series([tsmp, tsmp], name='B'),
        }
        expected_series_mixed = {
            'A': Series([tsmp, tsmp], name='A'),
            'B': Series([1, 2], name='B'),
        }

        tm.assert_dict_equal(test_data.to_dict(orient='series'),
                             expected_series)
        tm.assert_dict_equal(test_data_mixed.to_dict(orient='series'),
                             expected_series_mixed)

        expected_split = {
            'index': [0, 1],
            'data': [[tsmp, tsmp],
                     [tsmp, tsmp]],
            'columns': ['A', 'B']
        }
        expected_split_mixed = {
            'index': [0, 1],
            'data': [[tsmp, 1],
                     [tsmp, 2]],
            'columns': ['A', 'B']
        }

        tm.assert_dict_equal(test_data.to_dict(orient='split'),
                             expected_split)
        tm.assert_dict_equal(test_data_mixed.to_dict(orient='split'),
                             expected_split_mixed)

    def test_to_dict_index_not_unique_with_index_orient(self):
        # GH22801
        # Data loss when indexes are not unique. Raise ValueError.
        df = DataFrame({'a': [1, 2], 'b': [0.5, 0.75]}, index=['A', 'A'])
        msg = "DataFrame index must be unique for orient='index'"
        with pytest.raises(ValueError, match=msg):
            df.to_dict(orient='index')

    def test_to_dict_invalid_orient(self):
        df = DataFrame({'A': [0, 1]})
        msg = "orient 'xinvalid' not understood"
        with pytest.raises(ValueError, match=msg):
            df.to_dict(orient='xinvalid')

    def test_to_records_dt64(self):
        df = DataFrame([["one", "two", "three"],
                        ["four", "five", "six"]],
                       index=date_range("2012-01-01", "2012-01-02"))

        # convert_datetime64 defaults to None
        expected = df.index.values[0]
        result = df.to_records()['index'][0]
        assert expected == result

        # check for FutureWarning if convert_datetime64=False is passed
        with tm.assert_produces_warning(FutureWarning):
            expected = df.index.values[0]
            result = df.to_records(convert_datetime64=False)['index'][0]
            assert expected == result

        # check for FutureWarning if convert_datetime64=True is passed
        with tm.assert_produces_warning(FutureWarning):
            expected = df.index[0]
            result = df.to_records(convert_datetime64=True)['index'][0]
            assert expected == result

    def test_to_records_with_multindex(self):
        # GH3189
        index = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
                 ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
        data = np.zeros((8, 4))
        df = DataFrame(data, index=index)
        r = df.to_records(index=True)['level_0']
        assert 'bar' in r
        assert 'one' not in r

    def test_to_records_with_Mapping_type(self):
        import email
        from email.parser import Parser

        abc.Mapping.register(email.message.Message)

        headers = Parser().parsestr('From: <*****@*****.**>\n'
                                    'To: <*****@*****.**>\n'
                                    'Subject: Test message\n'
                                    '\n'
                                    'Body would go here\n')

        frame = DataFrame.from_records([headers])
        all(x in frame for x in ['Type', 'Subject', 'From'])

    def test_to_records_floats(self):
        df = DataFrame(np.random.rand(10, 10))
        df.to_records()

    def test_to_records_index_name(self):
        df = DataFrame(np.random.randn(3, 3))
        df.index.name = 'X'
        rs = df.to_records()
        assert 'X' in rs.dtype.fields

        df = DataFrame(np.random.randn(3, 3))
        rs = df.to_records()
        assert 'index' in rs.dtype.fields

        df.index = MultiIndex.from_tuples([('a', 'x'), ('a', 'y'), ('b', 'z')])
        df.index.names = ['A', None]
        rs = df.to_records()
        assert 'level_0' in rs.dtype.fields

    def test_to_records_with_unicode_index(self):
        # GH13172
        # unicode_literals conflict with to_records
        result = DataFrame([{'a': 'x', 'b': 'y'}]).set_index('a') \
            .to_records()
        expected = np.rec.array([('x', 'y')], dtype=[('a', 'O'), ('b', 'O')])
        tm.assert_almost_equal(result, expected)

    def test_to_records_with_unicode_column_names(self):
        # xref issue: https://github.com/numpy/numpy/issues/2407
        # Issue #11879. to_records used to raise an exception when used
        # with column names containing non-ascii characters in Python 2
        result = DataFrame(data={"accented_name_é": [1.0]}).to_records()

        # Note that numpy allows for unicode field names but dtypes need
        # to be specified using dictionary instead of list of tuples.
        expected = np.rec.array(
            [(0, 1.0)],
            dtype={"names": ["index", "accented_name_é"],
                   "formats": ['=i8', '=f8']}
        )
        tm.assert_almost_equal(result, expected)

    def test_to_records_with_categorical(self):

        # GH8626

        # dict creation
        df = DataFrame({'A': list('abc')}, dtype='category')
        expected = Series(list('abc'), dtype='category', name='A')
        tm.assert_series_equal(df['A'], expected)

        # list-like creation
        df = DataFrame(list('abc'), dtype='category')
        expected = Series(list('abc'), dtype='category', name=0)
        tm.assert_series_equal(df[0], expected)

        # to record array
        # this coerces
        result = df.to_records()
        expected = np.rec.array([(0, 'a'), (1, 'b'), (2, 'c')],
                                dtype=[('index', '=i8'), ('0', 'O')])
        tm.assert_almost_equal(result, expected)

    @pytest.mark.parametrize("kwargs,expected", [
        # No dtypes --> default to array dtypes.
        (dict(),
         np.rec.array([(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                      dtype=[("index", "<i8"), ("A", "<i8"),
                             ("B", "<f8"), ("C", "O")])),

        # Should have no effect in this case.
        (dict(index=True),
         np.rec.array([(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                      dtype=[("index", "<i8"), ("A", "<i8"),
                             ("B", "<f8"), ("C", "O")])),

        # Column dtype applied across the board. Index unaffected.
        (dict(column_dtypes="<U4"),
         np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                      dtype=[("index", "<i8"), ("A", "<U4"),
                             ("B", "<U4"), ("C", "<U4")])),

        # Index dtype applied across the board. Columns unaffected.
        (dict(index_dtypes="<U1"),
         np.rec.array([("0", 1, 0.2, "a"), ("1", 2, 1.5, "bc")],
                      dtype=[("index", "<U1"), ("A", "<i8"),
                             ("B", "<f8"), ("C", "O")])),

        # Pass in a type instance.
        (dict(column_dtypes=np.unicode),
         np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                      dtype=[("index", "<i8"), ("A", "<U"),
                             ("B", "<U"), ("C", "<U")])),

        # Pass in a dtype instance.
        (dict(column_dtypes=np.dtype('unicode')),
         np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                      dtype=[("index", "<i8"), ("A", "<U"),
                             ("B", "<U"), ("C", "<U")])),

        # Pass in a dictionary (name-only).
        (dict(column_dtypes={"A": np.int8, "B": np.float32, "C": "<U2"}),
         np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                      dtype=[("index", "<i8"), ("A", "i1"),
                             ("B", "<f4"), ("C", "<U2")])),

        # Pass in a dictionary (indices-only).
        (dict(index_dtypes={0: "int16"}),
         np.rec.array([(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                      dtype=[("index", "i2"), ("A", "<i8"),
                             ("B", "<f8"), ("C", "O")])),

        # Ignore index mappings if index is not True.
        (dict(index=False, index_dtypes="<U2"),
         np.rec.array([(1, 0.2, "a"), (2, 1.5, "bc")],
                      dtype=[("A", "<i8"), ("B", "<f8"), ("C", "O")])),

        # Non-existent names / indices in mapping should not error.
        (dict(index_dtypes={0: "int16", "not-there": "float32"}),
         np.rec.array([(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                      dtype=[("index", "i2"), ("A", "<i8"),
                             ("B", "<f8"), ("C", "O")])),

        # Names / indices not in mapping default to array dtype.
        (dict(column_dtypes={"A": np.int8, "B": np.float32}),
         np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                      dtype=[("index", "<i8"), ("A", "i1"),
                             ("B", "<f4"), ("C", "O")])),

        # Names / indices not in dtype mapping default to array dtype.
        (dict(column_dtypes={"A": np.dtype('int8'), "B": np.dtype('float32')}),
         np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                      dtype=[("index", "<i8"), ("A", "i1"),
                             ("B", "<f4"), ("C", "O")])),

        # Mixture of everything.
        (dict(column_dtypes={"A": np.int8, "B": np.float32},
              index_dtypes="<U2"),
         np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                      dtype=[("index", "<U2"), ("A", "i1"),
                             ("B", "<f4"), ("C", "O")])),

        # Invalid dype values.
        (dict(index=False, column_dtypes=list()),
         (ValueError, "Invalid dtype \\[\\] specified for column A")),

        (dict(index=False, column_dtypes={"A": "int32", "B": 5}),
         (ValueError, "Invalid dtype 5 specified for column B")),

        # Numpy can't handle EA types, so check error is raised
        (dict(index=False, column_dtypes={"A": "int32",
                                          "B": CategoricalDtype(['a', 'b'])}),
         (ValueError, 'Invalid dtype category specified for column B')),

        # Check that bad types raise
        (dict(index=False, column_dtypes={"A": "int32", "B": "foo"}),
         (TypeError, 'data type "foo" not understood')),
    ])
    def test_to_records_dtype(self, kwargs, expected):
        # see gh-18146
        df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]})

        if not isinstance(expected, np.recarray):
            with pytest.raises(expected[0], match=expected[1]):
                df.to_records(**kwargs)
        else:
            result = df.to_records(**kwargs)
            tm.assert_almost_equal(result, expected)

    @pytest.mark.parametrize("df,kwargs,expected", [
        # MultiIndex in the index.
        (DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                   columns=list("abc")).set_index(["a", "b"]),
         dict(column_dtypes="float64", index_dtypes={0: "int32", 1: "int8"}),
         np.rec.array([(1, 2, 3.), (4, 5, 6.), (7, 8, 9.)],
                      dtype=[("a", "<i4"), ("b", "i1"), ("c", "<f8")])),

        # MultiIndex in the columns.
        (DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                   columns=MultiIndex.from_tuples([("a", "d"), ("b", "e"),
                                                   ("c", "f")])),
         dict(column_dtypes={0: "<U1", 2: "float32"}, index_dtypes="float32"),
         np.rec.array([(0., "1", 2, 3.), (1., "4", 5, 6.),
                       (2., "7", 8, 9.)],
                      dtype=[("index", "<f4"),
                             ("('a', 'd')", "<U1"),
                             ("('b', 'e')", "<i8"),
                             ("('c', 'f')", "<f4")])),

        # MultiIndex in both the columns and index.
        (DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                   columns=MultiIndex.from_tuples([
                       ("a", "d"), ("b", "e"), ("c", "f")], names=list("ab")),
                   index=MultiIndex.from_tuples([
                       ("d", -4), ("d", -5), ("f", -6)], names=list("cd"))),
         dict(column_dtypes="float64", index_dtypes={0: "<U2", 1: "int8"}),
         np.rec.array([("d", -4, 1., 2., 3.), ("d", -5, 4., 5., 6.),
                       ("f", -6, 7, 8, 9.)],
                      dtype=[("c", "<U2"), ("d", "i1"),
                             ("('a', 'd')", "<f8"), ("('b', 'e')", "<f8"),
                             ("('c', 'f')", "<f8")]))
    ])
    def test_to_records_dtype_mi(self, df, kwargs, expected):
        # see gh-18146
        result = df.to_records(**kwargs)
        tm.assert_almost_equal(result, expected)

    def test_to_records_dict_like(self):
        # see gh-18146
        class DictLike(object):
            def __init__(self, **kwargs):
                self.d = kwargs.copy()

            def __getitem__(self, key):
                return self.d.__getitem__(key)

            def __contains__(self, key):
                return key in self.d

            def keys(self):
                return self.d.keys()

        df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]})

        dtype_mappings = dict(column_dtypes=DictLike(**{"A": np.int8,
                                                        "B": np.float32}),
                              index_dtypes="<U2")

        result = df.to_records(**dtype_mappings)
        expected = np.rec.array([("0", "1", "0.2", "a"),
                                 ("1", "2", "1.5", "bc")],
                                dtype=[("index", "<U2"), ("A", "i1"),
                                       ("B", "<f4"), ("C", "O")])
        tm.assert_almost_equal(result, expected)

    @pytest.mark.parametrize('mapping', [dict, defaultdict(list), OrderedDict])
    def test_to_dict(self, mapping):
        test_data = {
            'A': {'1': 1, '2': 2},
            'B': {'1': '1', '2': '2', '3': '3'},
        }

        # GH16122
        recons_data = DataFrame(test_data).to_dict(into=mapping)

        for k, v in compat.iteritems(test_data):
            for k2, v2 in compat.iteritems(v):
                assert (v2 == recons_data[k][k2])

        recons_data = DataFrame(test_data).to_dict("l", mapping)

        for k, v in compat.iteritems(test_data):
            for k2, v2 in compat.iteritems(v):
                assert (v2 == recons_data[k][int(k2) - 1])

        recons_data = DataFrame(test_data).to_dict("s", mapping)

        for k, v in compat.iteritems(test_data):
            for k2, v2 in compat.iteritems(v):
                assert (v2 == recons_data[k][k2])

        recons_data = DataFrame(test_data).to_dict("sp", mapping)
        expected_split = {'columns': ['A', 'B'], 'index': ['1', '2', '3'],
                          'data': [[1.0, '1'], [2.0, '2'], [np.nan, '3']]}
        tm.assert_dict_equal(recons_data, expected_split)

        recons_data = DataFrame(test_data).to_dict("r", mapping)
        expected_records = [{'A': 1.0, 'B': '1'},
                            {'A': 2.0, 'B': '2'},
                            {'A': np.nan, 'B': '3'}]
        assert isinstance(recons_data, list)
        assert (len(recons_data) == 3)
        for l, r in zip(recons_data, expected_records):
            tm.assert_dict_equal(l, r)

        # GH10844
        recons_data = DataFrame(test_data).to_dict("i")

        for k, v in compat.iteritems(test_data):
            for k2, v2 in compat.iteritems(v):
                assert (v2 == recons_data[k2][k])

        df = DataFrame(test_data)
        df['duped'] = df[df.columns[0]]
        recons_data = df.to_dict("i")
        comp_data = test_data.copy()
        comp_data['duped'] = comp_data[df.columns[0]]
        for k, v in compat.iteritems(comp_data):
            for k2, v2 in compat.iteritems(v):
                assert (v2 == recons_data[k2][k])

    @pytest.mark.parametrize('mapping', [list, defaultdict, []])
    def test_to_dict_errors(self, mapping):
        # GH16122
        df = DataFrame(np.random.randn(3, 3))
        with pytest.raises(TypeError):
            df.to_dict(into=mapping)

    def test_to_dict_not_unique_warning(self):
        # GH16927: When converting to a dict, if a column has a non-unique name
        # it will be dropped, throwing a warning.
        df = DataFrame([[1, 2, 3]], columns=['a', 'a', 'b'])
        with tm.assert_produces_warning(UserWarning):
            df.to_dict()

    @pytest.mark.parametrize('tz', ['UTC', 'GMT', 'US/Eastern'])
    def test_to_records_datetimeindex_with_tz(self, tz):
        # GH13937
        dr = date_range('2016-01-01', periods=10,
                        freq='S', tz=tz)

        df = DataFrame({'datetime': dr}, index=dr)

        expected = df.to_records()
        result = df.tz_convert("UTC").to_records()

        # both converted to UTC, so they are equal
        tm.assert_numpy_array_equal(result, expected)

    # orient - orient argument to to_dict function
    # item_getter - function for extracting value from
    # the resulting dict using column name and index
    @pytest.mark.parametrize('orient,item_getter', [
        ('dict', lambda d, col, idx: d[col][idx]),
        ('records', lambda d, col, idx: d[idx][col]),
        ('list', lambda d, col, idx: d[col][idx]),
        ('split', lambda d, col, idx: d['data'][idx][d['columns'].index(col)]),
        ('index', lambda d, col, idx: d[idx][col])
    ])
    def test_to_dict_box_scalars(self, orient, item_getter):
        # 14216, 23753
        # make sure that we are boxing properly
        df = DataFrame({'a': [1, 2], 'b': [.1, .2]})
        result = df.to_dict(orient=orient)
        assert isinstance(item_getter(result, 'a', 0), int)
        assert isinstance(item_getter(result, 'b', 0), float)

    def test_frame_to_dict_tz(self):
        # GH18372 When converting to dict with orient='records' columns of
        # datetime that are tz-aware were not converted to required arrays
        data = [(datetime(2017, 11, 18, 21, 53, 0, 219225, tzinfo=pytz.utc),),
                (datetime(2017, 11, 18, 22, 6, 30, 61810, tzinfo=pytz.utc,),)]
        df = DataFrame(list(data), columns=["d", ])

        result = df.to_dict(orient='records')
        expected = [
            {'d': Timestamp('2017-11-18 21:53:00.219225+0000', tz=pytz.utc)},
            {'d': Timestamp('2017-11-18 22:06:30.061810+0000', tz=pytz.utc)},
        ]
        tm.assert_dict_equal(result[0], expected[0])
        tm.assert_dict_equal(result[1], expected[1])

    @pytest.mark.parametrize('into, expected', [
        (dict, {0: {'int_col': 1, 'float_col': 1.0},
                1: {'int_col': 2, 'float_col': 2.0},
                2: {'int_col': 3, 'float_col': 3.0}}),
        (OrderedDict, OrderedDict([(0, {'int_col': 1, 'float_col': 1.0}),
                                   (1, {'int_col': 2, 'float_col': 2.0}),
                                   (2, {'int_col': 3, 'float_col': 3.0})])),
        (defaultdict(list), defaultdict(list,
                                        {0: {'int_col': 1, 'float_col': 1.0},
                                         1: {'int_col': 2, 'float_col': 2.0},
                                         2: {'int_col': 3, 'float_col': 3.0}}))
    ])
    def test_to_dict_index_dtypes(self, into, expected):
        # GH 18580
        # When using to_dict(orient='index') on a dataframe with int
        # and float columns only the int columns were cast to float

        df = DataFrame({'int_col': [1, 2, 3],
                        'float_col': [1.0, 2.0, 3.0]})

        result = df.to_dict(orient='index', into=into)
        cols = ['int_col', 'float_col']
        result = DataFrame.from_dict(result, orient='index')[cols]
        expected = DataFrame.from_dict(expected, orient='index')[cols]
        tm.assert_frame_equal(result, expected)

    def test_to_dict_numeric_names(self):
        # https://github.com/pandas-dev/pandas/issues/24940
        df = DataFrame({str(i): [i] for i in range(5)})
        result = set(df.to_dict('records')[0].keys())
        expected = set(df.columns)
        assert result == expected

    def test_to_dict_wide(self):
        # https://github.com/pandas-dev/pandas/issues/24939
        df = DataFrame({('A_{:d}'.format(i)): [i] for i in range(256)})
        result = df.to_dict('records')[0]
        expected = {'A_{:d}'.format(i): i for i in range(256)}
        assert result == expected

Пример #21

Показать файл

Файл: test_convert_to.py Проект: raghavgai/mypandas

class TestDataFrameConvertTo:
    def test_to_dict_timestamp(self):

        # GH11247
        # split/records producing np.datetime64 rather than Timestamps
        # on datetime64[ns] dtypes only

        tsmp = Timestamp("20130101")
        test_data = DataFrame({"A": [tsmp, tsmp], "B": [tsmp, tsmp]})
        test_data_mixed = DataFrame({"A": [tsmp, tsmp], "B": [1, 2]})

        expected_records = [{"A": tsmp, "B": tsmp}, {"A": tsmp, "B": tsmp}]
        expected_records_mixed = [{"A": tsmp, "B": 1}, {"A": tsmp, "B": 2}]

        assert test_data.to_dict(orient="records") == expected_records
        assert test_data_mixed.to_dict(
            orient="records") == expected_records_mixed

        expected_series = {
            "A": Series([tsmp, tsmp], name="A"),
            "B": Series([tsmp, tsmp], name="B"),
        }
        expected_series_mixed = {
            "A": Series([tsmp, tsmp], name="A"),
            "B": Series([1, 2], name="B"),
        }

        tm.assert_dict_equal(test_data.to_dict(orient="series"),
                             expected_series)
        tm.assert_dict_equal(test_data_mixed.to_dict(orient="series"),
                             expected_series_mixed)

        expected_split = {
            "index": [0, 1],
            "data": [[tsmp, tsmp], [tsmp, tsmp]],
            "columns": ["A", "B"],
        }
        expected_split_mixed = {
            "index": [0, 1],
            "data": [[tsmp, 1], [tsmp, 2]],
            "columns": ["A", "B"],
        }

        tm.assert_dict_equal(test_data.to_dict(orient="split"), expected_split)
        tm.assert_dict_equal(test_data_mixed.to_dict(orient="split"),
                             expected_split_mixed)

    def test_to_dict_index_not_unique_with_index_orient(self):
        # GH22801
        # Data loss when indexes are not unique. Raise ValueError.
        df = DataFrame({"a": [1, 2], "b": [0.5, 0.75]}, index=["A", "A"])
        msg = "DataFrame index must be unique for orient='index'"
        with pytest.raises(ValueError, match=msg):
            df.to_dict(orient="index")

    def test_to_dict_invalid_orient(self):
        df = DataFrame({"A": [0, 1]})
        msg = "orient 'xinvalid' not understood"
        with pytest.raises(ValueError, match=msg):
            df.to_dict(orient="xinvalid")

    def test_to_records_dt64(self):
        df = DataFrame(
            [["one", "two", "three"], ["four", "five", "six"]],
            index=date_range("2012-01-01", "2012-01-02"),
        )

        expected = df.index.values[0]
        result = df.to_records()["index"][0]
        assert expected == result

    def test_to_records_with_multindex(self):
        # GH3189
        index = [
            ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
            ["one", "two", "one", "two", "one", "two", "one", "two"],
        ]
        data = np.zeros((8, 4))
        df = DataFrame(data, index=index)
        r = df.to_records(index=True)["level_0"]
        assert "bar" in r
        assert "one" not in r

    def test_to_records_with_Mapping_type(self):
        import email
        from email.parser import Parser

        abc.Mapping.register(email.message.Message)

        headers = Parser().parsestr("From: <*****@*****.**>\n"
                                    "To: <*****@*****.**>\n"
                                    "Subject: Test message\n"
                                    "\n"
                                    "Body would go here\n")

        frame = DataFrame.from_records([headers])
        all(x in frame for x in ["Type", "Subject", "From"])

    def test_to_records_floats(self):
        df = DataFrame(np.random.rand(10, 10))
        df.to_records()

    def test_to_records_index_name(self):
        df = DataFrame(np.random.randn(3, 3))
        df.index.name = "X"
        rs = df.to_records()
        assert "X" in rs.dtype.fields

        df = DataFrame(np.random.randn(3, 3))
        rs = df.to_records()
        assert "index" in rs.dtype.fields

        df.index = MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")])
        df.index.names = ["A", None]
        rs = df.to_records()
        assert "level_0" in rs.dtype.fields

    def test_to_records_with_unicode_index(self):
        # GH13172
        # unicode_literals conflict with to_records
        result = DataFrame([{"a": "x", "b": "y"}]).set_index("a").to_records()
        expected = np.rec.array([("x", "y")], dtype=[("a", "O"), ("b", "O")])
        tm.assert_almost_equal(result, expected)

    def test_to_records_with_unicode_column_names(self):
        # xref issue: https://github.com/numpy/numpy/issues/2407
        # Issue #11879. to_records used to raise an exception when used
        # with column names containing non-ascii characters in Python 2
        result = DataFrame(data={"accented_name_é": [1.0]}).to_records()

        # Note that numpy allows for unicode field names but dtypes need
        # to be specified using dictionary instead of list of tuples.
        expected = np.rec.array(
            [(0, 1.0)],
            dtype={
                "names": ["index", "accented_name_é"],
                "formats": ["=i8", "=f8"]
            },
        )
        tm.assert_almost_equal(result, expected)

    def test_to_records_with_categorical(self):

        # GH8626

        # dict creation
        df = DataFrame({"A": list("abc")}, dtype="category")
        expected = Series(list("abc"), dtype="category", name="A")
        tm.assert_series_equal(df["A"], expected)

        # list-like creation
        df = DataFrame(list("abc"), dtype="category")
        expected = Series(list("abc"), dtype="category", name=0)
        tm.assert_series_equal(df[0], expected)

        # to record array
        # this coerces
        result = df.to_records()
        expected = np.rec.array([(0, "a"), (1, "b"), (2, "c")],
                                dtype=[("index", "=i8"), ("0", "O")])
        tm.assert_almost_equal(result, expected)

    @pytest.mark.parametrize(
        "kwargs,expected",
        [
            # No dtypes --> default to array dtypes.
            (
                dict(),
                np.rec.array(
                    [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                    dtype=[("index", "<i8"), ("A", "<i8"), ("B", "<f8"),
                           ("C", "O")],
                ),
            ),
            # Should have no effect in this case.
            (
                dict(index=True),
                np.rec.array(
                    [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                    dtype=[("index", "<i8"), ("A", "<i8"), ("B", "<f8"),
                           ("C", "O")],
                ),
            ),
            # Column dtype applied across the board. Index unaffected.
            (
                dict(column_dtypes="<U4"),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<i8"), ("A", "<U4"), ("B", "<U4"),
                           ("C", "<U4")],
                ),
            ),
            # Index dtype applied across the board. Columns unaffected.
            (
                dict(index_dtypes="<U1"),
                np.rec.array(
                    [("0", 1, 0.2, "a"), ("1", 2, 1.5, "bc")],
                    dtype=[("index", "<U1"), ("A", "<i8"), ("B", "<f8"),
                           ("C", "O")],
                ),
            ),
            # Pass in a type instance.
            (
                dict(column_dtypes=np.unicode),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<i8"), ("A", "<U"), ("B", "<U"),
                           ("C", "<U")],
                ),
            ),
            # Pass in a dtype instance.
            (
                dict(column_dtypes=np.dtype("unicode")),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<i8"), ("A", "<U"), ("B", "<U"),
                           ("C", "<U")],
                ),
            ),
            # Pass in a dictionary (name-only).
            (
                dict(column_dtypes={
                    "A": np.int8,
                    "B": np.float32,
                    "C": "<U2"
                }),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<i8"), ("A", "i1"), ("B", "<f4"),
                           ("C", "<U2")],
                ),
            ),
            # Pass in a dictionary (indices-only).
            (
                dict(index_dtypes={0: "int16"}),
                np.rec.array(
                    [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                    dtype=[("index", "i2"), ("A", "<i8"), ("B", "<f8"),
                           ("C", "O")],
                ),
            ),
            # Ignore index mappings if index is not True.
            (
                dict(index=False, index_dtypes="<U2"),
                np.rec.array(
                    [(1, 0.2, "a"), (2, 1.5, "bc")],
                    dtype=[("A", "<i8"), ("B", "<f8"), ("C", "O")],
                ),
            ),
            # Non-existent names / indices in mapping should not error.
            (
                dict(index_dtypes={
                    0: "int16",
                    "not-there": "float32"
                }),
                np.rec.array(
                    [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                    dtype=[("index", "i2"), ("A", "<i8"), ("B", "<f8"),
                           ("C", "O")],
                ),
            ),
            # Names / indices not in mapping default to array dtype.
            (
                dict(column_dtypes={
                    "A": np.int8,
                    "B": np.float32
                }),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<i8"), ("A", "i1"), ("B", "<f4"),
                           ("C", "O")],
                ),
            ),
            # Names / indices not in dtype mapping default to array dtype.
            (
                dict(column_dtypes={
                    "A": np.dtype("int8"),
                    "B": np.dtype("float32")
                }),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<i8"), ("A", "i1"), ("B", "<f4"),
                           ("C", "O")],
                ),
            ),
            # Mixture of everything.
            (
                dict(column_dtypes={
                    "A": np.int8,
                    "B": np.float32
                },
                     index_dtypes="<U2"),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<U2"), ("A", "i1"), ("B", "<f4"),
                           ("C", "O")],
                ),
            ),
            # Invalid dype values.
            (
                dict(index=False, column_dtypes=list()),
                (ValueError, "Invalid dtype \\[\\] specified for column A"),
            ),
            (
                dict(index=False, column_dtypes={
                    "A": "int32",
                    "B": 5
                }),
                (ValueError, "Invalid dtype 5 specified for column B"),
            ),
            # Numpy can't handle EA types, so check error is raised
            (
                dict(
                    index=False,
                    column_dtypes={
                        "A": "int32",
                        "B": CategoricalDtype(["a", "b"])
                    },
                ),
                (ValueError, "Invalid dtype category specified for column B"),
            ),
            # Check that bad types raise
            (
                dict(index=False, column_dtypes={
                    "A": "int32",
                    "B": "foo"
                }),
                (TypeError, 'data type "foo" not understood'),
            ),
        ],
    )
    def test_to_records_dtype(self, kwargs, expected):
        # see gh-18146
        df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]})

        if not isinstance(expected, np.recarray):
            with pytest.raises(expected[0], match=expected[1]):
                df.to_records(**kwargs)
        else:
            result = df.to_records(**kwargs)
            tm.assert_almost_equal(result, expected)

    @pytest.mark.parametrize(
        "df,kwargs,expected",
        [
            # MultiIndex in the index.
            (
                DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                          columns=list("abc")).set_index(["a", "b"]),
                dict(column_dtypes="float64",
                     index_dtypes={
                         0: "int32",
                         1: "int8"
                     }),
                np.rec.array(
                    [(1, 2, 3.0), (4, 5, 6.0), (7, 8, 9.0)],
                    dtype=[("a", "<i4"), ("b", "i1"), ("c", "<f8")],
                ),
            ),
            # MultiIndex in the columns.
            (
                DataFrame(
                    [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                    columns=MultiIndex.from_tuples([("a", "d"), ("b", "e"),
                                                    ("c", "f")]),
                ),
                dict(column_dtypes={
                    0: "<U1",
                    2: "float32"
                },
                     index_dtypes="float32"),
                np.rec.array(
                    [(0.0, "1", 2, 3.0), (1.0, "4", 5, 6.0),
                     (2.0, "7", 8, 9.0)],
                    dtype=[
                        ("index", "<f4"),
                        ("('a', 'd')", "<U1"),
                        ("('b', 'e')", "<i8"),
                        ("('c', 'f')", "<f4"),
                    ],
                ),
            ),
            # MultiIndex in both the columns and index.
            (
                DataFrame(
                    [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                    columns=MultiIndex.from_tuples([("a", "d"), ("b", "e"),
                                                    ("c", "f")],
                                                   names=list("ab")),
                    index=MultiIndex.from_tuples([("d", -4), ("d", -5),
                                                  ("f", -6)],
                                                 names=list("cd")),
                ),
                dict(column_dtypes="float64",
                     index_dtypes={
                         0: "<U2",
                         1: "int8"
                     }),
                np.rec.array(
                    [
                        ("d", -4, 1.0, 2.0, 3.0),
                        ("d", -5, 4.0, 5.0, 6.0),
                        ("f", -6, 7, 8, 9.0),
                    ],
                    dtype=[
                        ("c", "<U2"),
                        ("d", "i1"),
                        ("('a', 'd')", "<f8"),
                        ("('b', 'e')", "<f8"),
                        ("('c', 'f')", "<f8"),
                    ],
                ),
            ),
        ],
    )
    def test_to_records_dtype_mi(self, df, kwargs, expected):
        # see gh-18146
        result = df.to_records(**kwargs)
        tm.assert_almost_equal(result, expected)

    def test_to_records_dict_like(self):
        # see gh-18146
        class DictLike:
            def __init__(self, **kwargs):
                self.d = kwargs.copy()

            def __getitem__(self, key):
                return self.d.__getitem__(key)

            def __contains__(self, key):
                return key in self.d

            def keys(self):
                return self.d.keys()

        df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]})

        dtype_mappings = dict(
            column_dtypes=DictLike(**{
                "A": np.int8,
                "B": np.float32
            }),
            index_dtypes="<U2",
        )

        result = df.to_records(**dtype_mappings)
        expected = np.rec.array(
            [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
            dtype=[("index", "<U2"), ("A", "i1"), ("B", "<f4"), ("C", "O")],
        )
        tm.assert_almost_equal(result, expected)

    @pytest.mark.parametrize("mapping", [dict, defaultdict(list), OrderedDict])
    def test_to_dict(self, mapping):
        test_data = {
            "A": {
                "1": 1,
                "2": 2
            },
            "B": {
                "1": "1",
                "2": "2",
                "3": "3"
            }
        }

        # GH16122
        recons_data = DataFrame(test_data).to_dict(into=mapping)

        for k, v in test_data.items():
            for k2, v2 in v.items():
                assert v2 == recons_data[k][k2]

        recons_data = DataFrame(test_data).to_dict("l", mapping)

        for k, v in test_data.items():
            for k2, v2 in v.items():
                assert v2 == recons_data[k][int(k2) - 1]

        recons_data = DataFrame(test_data).to_dict("s", mapping)

        for k, v in test_data.items():
            for k2, v2 in v.items():
                assert v2 == recons_data[k][k2]

        recons_data = DataFrame(test_data).to_dict("sp", mapping)
        expected_split = {
            "columns": ["A", "B"],
            "index": ["1", "2", "3"],
            "data": [[1.0, "1"], [2.0, "2"], [np.nan, "3"]],
        }
        tm.assert_dict_equal(recons_data, expected_split)

        recons_data = DataFrame(test_data).to_dict("r", mapping)
        expected_records = [
            {
                "A": 1.0,
                "B": "1"
            },
            {
                "A": 2.0,
                "B": "2"
            },
            {
                "A": np.nan,
                "B": "3"
            },
        ]
        assert isinstance(recons_data, list)
        assert len(recons_data) == 3
        for l, r in zip(recons_data, expected_records):
            tm.assert_dict_equal(l, r)

        # GH10844
        recons_data = DataFrame(test_data).to_dict("i")

        for k, v in test_data.items():
            for k2, v2 in v.items():
                assert v2 == recons_data[k2][k]

        df = DataFrame(test_data)
        df["duped"] = df[df.columns[0]]
        recons_data = df.to_dict("i")
        comp_data = test_data.copy()
        comp_data["duped"] = comp_data[df.columns[0]]
        for k, v in comp_data.items():
            for k2, v2 in v.items():
                assert v2 == recons_data[k2][k]

    @pytest.mark.parametrize("mapping", [list, defaultdict, []])
    def test_to_dict_errors(self, mapping):
        # GH16122
        df = DataFrame(np.random.randn(3, 3))
        with pytest.raises(TypeError):
            df.to_dict(into=mapping)

    def test_to_dict_not_unique_warning(self):
        # GH16927: When converting to a dict, if a column has a non-unique name
        # it will be dropped, throwing a warning.
        df = DataFrame([[1, 2, 3]], columns=["a", "a", "b"])
        with tm.assert_produces_warning(UserWarning):
            df.to_dict()

    @pytest.mark.parametrize("tz", ["UTC", "GMT", "US/Eastern"])
    def test_to_records_datetimeindex_with_tz(self, tz):
        # GH13937
        dr = date_range("2016-01-01", periods=10, freq="S", tz=tz)

        df = DataFrame({"datetime": dr}, index=dr)

        expected = df.to_records()
        result = df.tz_convert("UTC").to_records()

        # both converted to UTC, so they are equal
        tm.assert_numpy_array_equal(result, expected)

    # orient - orient argument to to_dict function
    # item_getter - function for extracting value from
    # the resulting dict using column name and index
    @pytest.mark.parametrize(
        "orient,item_getter",
        [
            ("dict", lambda d, col, idx: d[col][idx]),
            ("records", lambda d, col, idx: d[idx][col]),
            ("list", lambda d, col, idx: d[col][idx]),
            ("split",
             lambda d, col, idx: d["data"][idx][d["columns"].index(col)]),
            ("index", lambda d, col, idx: d[idx][col]),
        ],
    )
    def test_to_dict_box_scalars(self, orient, item_getter):
        # 14216, 23753
        # make sure that we are boxing properly
        df = DataFrame({"a": [1, 2], "b": [0.1, 0.2]})
        result = df.to_dict(orient=orient)
        assert isinstance(item_getter(result, "a", 0), int)
        assert isinstance(item_getter(result, "b", 0), float)

    def test_frame_to_dict_tz(self):
        # GH18372 When converting to dict with orient='records' columns of
        # datetime that are tz-aware were not converted to required arrays
        data = [
            (datetime(2017, 11, 18, 21, 53, 0, 219225, tzinfo=pytz.utc), ),
            (datetime(2017, 11, 18, 22, 6, 30, 61810, tzinfo=pytz.utc), ),
        ]
        df = DataFrame(list(data), columns=["d"])

        result = df.to_dict(orient="records")
        expected = [
            {
                "d": Timestamp("2017-11-18 21:53:00.219225+0000", tz=pytz.utc)
            },
            {
                "d": Timestamp("2017-11-18 22:06:30.061810+0000", tz=pytz.utc)
            },
        ]
        tm.assert_dict_equal(result[0], expected[0])
        tm.assert_dict_equal(result[1], expected[1])

    @pytest.mark.parametrize(
        "into, expected",
        [
            (
                dict,
                {
                    0: {
                        "int_col": 1,
                        "float_col": 1.0
                    },
                    1: {
                        "int_col": 2,
                        "float_col": 2.0
                    },
                    2: {
                        "int_col": 3,
                        "float_col": 3.0
                    },
                },
            ),
            (
                OrderedDict,
                OrderedDict([
                    (0, {
                        "int_col": 1,
                        "float_col": 1.0
                    }),
                    (1, {
                        "int_col": 2,
                        "float_col": 2.0
                    }),
                    (2, {
                        "int_col": 3,
                        "float_col": 3.0
                    }),
                ]),
            ),
            (
                defaultdict(dict),
                defaultdict(
                    dict,
                    {
                        0: {
                            "int_col": 1,
                            "float_col": 1.0
                        },
                        1: {
                            "int_col": 2,
                            "float_col": 2.0
                        },
                        2: {
                            "int_col": 3,
                            "float_col": 3.0
                        },
                    },
                ),
            ),
        ],
    )
    def test_to_dict_index_dtypes(self, into, expected):
        # GH 18580
        # When using to_dict(orient='index') on a dataframe with int
        # and float columns only the int columns were cast to float

        df = DataFrame({"int_col": [1, 2, 3], "float_col": [1.0, 2.0, 3.0]})

        result = df.to_dict(orient="index", into=into)
        cols = ["int_col", "float_col"]
        result = DataFrame.from_dict(result, orient="index")[cols]
        expected = DataFrame.from_dict(expected, orient="index")[cols]
        tm.assert_frame_equal(result, expected)

    def test_to_dict_numeric_names(self):
        # https://github.com/pandas-dev/pandas/issues/24940
        df = DataFrame({str(i): [i] for i in range(5)})
        result = set(df.to_dict("records")[0].keys())
        expected = set(df.columns)
        assert result == expected

    def test_to_dict_wide(self):
        # https://github.com/pandas-dev/pandas/issues/24939
        df = DataFrame({("A_{:d}".format(i)): [i] for i in range(256)})
        result = df.to_dict("records")[0]
        expected = {"A_{:d}".format(i): i for i in range(256)}
        assert result == expected

    def test_to_dict_orient_dtype(self):
        # https://github.com/pandas-dev/pandas/issues/22620
        # Input Data
        input_data = {
            "a": [1, 2, 3],
            "b": [1.0, 2.0, 3.0],
            "c": ["X", "Y", "Z"]
        }
        df = DataFrame(input_data)
        # Expected Dtypes
        expected = {"a": int, "b": float, "c": str}
        # Extracting dtypes out of to_dict operation
        for df_dict in df.to_dict("records"):
            result = {
                "a": type(df_dict["a"]),
                "b": type(df_dict["b"]),
                "c": type(df_dict["c"]),
            }
            assert result == expected

Пример #22

Показать файл

def test_london_cleaner():
    unclean_input = pd.DataFrame.from_dict(
        {
            "Place (Overall)": [12547, 34146],
            "Place (Gender)": [9390, 20833],
            "Place (Category)": [4345, 3132],
            "Name": ["»A Smith, Matthew (GBR) \n", "»Aalders, Jennifer (GBR) \n"],
            "Sex": ["M", "W"],
            "Club": ["Lymm Runners", "Tynny Trotters"],
            "Running Number": ["Runner Number40546", "Runner Number23235"],
            "Category": ["18-39", pd.NA],
            "Finish": ["0 days 03:59:33", "0 days 06:22:20"],
            "Year": [2021, 2021],
        }
    )

    exp_output = pd.DataFrame.from_dict(
        {
            "Place (Overall)": [12547, 34146],
            "Place (Gender)": [9390, 20833],
            "Place (Category)": [4345, 3132],
            "Name": ["A Smith Matthew", "Aalders Jennifer"],
            "Sex": ["M", "F"],
            "Club": ["Lymm Runners", "Tynny Trotters"],
            "Running Number": ["40546", "23235"],
            "Category": ["18-39", "Unknown"],
            "Finish": [
                pd.Timedelta("0 days 03:59:33"),
                pd.Timedelta("0 days 06:22:20"),
            ],
            "Year": [2021, 2021],
            "Country": ["GBR", "GBR"],
            "FirstName": ["Matthew", "Jennifer"],
            "LastName": ["A Smith", "Aalders"],
            "DSQ": [False, False],
            "Finish (Total Seconds)": [14373.0, 22940.0],
        }
    ).astype(
        {
            "Place (Overall)": Int64Dtype(),
            "Place (Gender)": Int64Dtype(),
            "Place (Category)": Int64Dtype(),
            "Name": dtype("O"),
            "Sex": dtype("O"),
            "Club": dtype("O"),
            "Running Number": dtype("O"),
            "Category": CategoricalDtype(
                categories=[
                    "18-39",
                    "40-44",
                    "45-49",
                    "50-54",
                    "55-59",
                    "60-64",
                    "65-69",
                    "70+",
                    "70-74",
                    "75-79",
                    "80-84",
                    "85+",
                    "80+",
                    "Unknown",
                ],
                ordered=False,
            ),
            "Finish": dtype("<m8[ns]"),
            "Year": Int64Dtype(),
            "Country": dtype("O"),
            "FirstName": dtype("O"),
            "LastName": dtype("O"),
            "DSQ": dtype("bool"),
            "Finish (Total Seconds)": dtype("float64"),
        }
    )

    actual_output = london_cleaner(unclean_input)

    pd.testing.assert_frame_equal(actual_output, exp_output, check_categorical=False)

Пример #23

Показать файл

    def get_feature_df(self) -> Tuple[pd.DataFrame, List[Any]]:
        """
        Transform incoming data into pandas dataframe
        :return: tuple(features pandas.DataFrame, unqualified item id list)
        """
        # prepare features dataframe
        target_qs = self.get_queryset()
        all_sample_ids = list(target_qs.values_list('id', flat=True))
        # TODO: all documents ref. by all_sample_ids should be in feature_table
        feature_table: Optional[pd.DataFrame] = None
        counter = 'counter'

        for feature_source_item in self.feature_source:
            msg = f'Get "{feature_source_item}" feature data:'
            self.log_message(msg)
            self.log_message('_' * len(msg))

            # get aggregation queryset parameters for .annotate function
            source_model = self.source_models[feature_source_item]
            source_field = self.source_fields[feature_source_item]
            target_id_field = self.target_id_field
            aggregation = {counter: self.aggregation_function}

            # try to decrease memory usage iterating over chunks and using sparse dataframes
            # Note: pivot_table takes extra memory so use lower memory limits
            source_qs = source_model.objects.filter(**{target_id_field + '__in': all_sample_ids})

            if hasattr(source_model, 'text_unit'):
                source_qs = source_qs.filter(**{self.unit_type_filter: self.unit_type})

            ids = sorted(source_qs.order_by(target_id_field).values_list(target_id_field, flat=True).distinct())
            terms = sorted(source_qs.order_by(source_field).values_list(source_field, flat=True).distinct())
            id_count = len(ids)
            term_count = len(terms)

            self.log_message(f'{self.source_item}s containing "{feature_source_item}": {id_count}')
            self.log_message(f'unique "{feature_source_item}" items: {term_count}')

            if not term_count:
                self.log_message(f'WARN: there are no "{feature_source_item}" entities found')
                continue

            from_mem_chunk_size = self.get_chunk_size(term_count * 2)    # np.uint16 - 2 bytes
            chunk_size = min([self.max_chunk_size, from_mem_chunk_size])
            self.log_message(f'chunk_size from_mem/min/final: {from_mem_chunk_size}/{self.max_chunk_size}/{chunk_size}')

            # TODO: we stopped using pd.SparseDataFrame as there's no such class anymore
            single_feature_table = SparseSingleFeatureTable(feature_source_item)

            for step in range(0, id_count, chunk_size):
                self.log_message(f'...process "{feature_source_item}" feature: "{self.source_item}s" range: {step}-{step + chunk_size}')
                sample_ids = ids[step:step + chunk_size]

                chunk_qs = source_qs \
                    .filter(**{target_id_field + '__in': sample_ids}) \
                    .order_by(target_id_field, source_field) \
                    .values(target_id_field, source_field) \
                    .annotate(**aggregation)

                df_src = list(chunk_qs)
                chunk_df = pd.DataFrame.from_records(df_src)
                del chunk_qs
                gc.collect()  # try to free up memory

                doc_cat = CategoricalDtype(sample_ids, ordered=True)
                # TODO: fix for date features: pandas can't compare dates, but datetimes only
                if terms and isinstance(terms[0], datetime.date):
                    terms = [datetime.datetime.combine(d, datetime.datetime.min.time()) for d in terms]
                term_cat = CategoricalDtype(terms, ordered=True)

                row = [] if chunk_df.empty else chunk_df[self.target_id_field].astype(doc_cat).cat.codes
                col = [] if chunk_df.empty else chunk_df[source_field].astype(term_cat).cat.codes
                val = [] if chunk_df.empty else chunk_df[counter]
                sparse_matrix = scp.csr_matrix(
                    (val, (row, col)),
                    shape=(len(sample_ids), term_cat.categories.size),
                    dtype=np.uint16)
                single_feature_table.join(sparse_matrix)
                del chunk_df
                gc.collect()  # try to free up memory
                mem = psutil.virtual_memory()
                self.log_message(f'......available memory: {get_mb(mem.available)}M ({mem.percent}%)')

            # join feature_source_item-specific dataframe into results dataframe
            gc.collect()    # try to free up memory
            single_feature_df_src = SparseAllFeaturesTable(ids)
            single_feature_df_src.add_feature_table(single_feature_table, terms)

            if feature_table is None:
                feature_table = single_feature_df_src.to_dataframe()
            else:
                feature_table = feature_table.join(single_feature_df_src.to_dataframe(), how='outer')
            del single_feature_table
            del single_feature_df_src
            gc.collect()    # try to free up memory
            # end of "for feature_source_item in self.feature_source"

        df = feature_table
        if self.drop_empty_columns:
            df.dropna(axis=1, how='all', inplace=True)

        self.log_message(f'df: {get_df_info(df)}')
        mem = psutil.virtual_memory()
        self.log_message(f'available memory: {get_mb(mem.available)}M ({mem.percent}%)')

        if df.empty:
            msg = 'No features of chosen "feature_source" options {} detected. ' \
                  'Empty Data Set.'.format(str(self.feature_source))
            raise EmptyDataSetError(msg, feature_source=self.feature_source)

        # item ids not included in feature df which don't have features at all
        initial_id_set = set(target_qs.values_list('id', flat=True))
        feature_id_set = set(df.index.tolist())
        unqualified_item_ids = sorted(list(initial_id_set.difference(feature_id_set)))

        self.log_message('count unqualified_item_ids: {}'.format(len(unqualified_item_ids)))

        if not self.drop_empty_rows and unqualified_item_ids:
            unqualified_items_df = pd.DataFrame(index=unqualified_item_ids, columns=df.columns).fillna(0)

            self.log_message('unqualified_items_df shape: {} size: {}'.format(
                unqualified_items_df.shape, unqualified_items_df.memory_usage().sum()))

            df = pd.concat([df, unqualified_items_df]).fillna(0).astype(np.uint16)

            self.log_message(f'df: {get_df_info(df)}')

        return df, unqualified_item_ids

Пример #24

Показать файл

Файл: test_astype.py Проект: youyou3418/pandas

class TestAstype:
    def test_astype_float(self, float_frame):
        casted = float_frame.astype(int)
        expected = DataFrame(
            float_frame.values.astype(int),
            index=float_frame.index,
            columns=float_frame.columns,
        )
        tm.assert_frame_equal(casted, expected)

        casted = float_frame.astype(np.int32)
        expected = DataFrame(
            float_frame.values.astype(np.int32),
            index=float_frame.index,
            columns=float_frame.columns,
        )
        tm.assert_frame_equal(casted, expected)

        float_frame["foo"] = "5"
        casted = float_frame.astype(int)
        expected = DataFrame(
            float_frame.values.astype(int),
            index=float_frame.index,
            columns=float_frame.columns,
        )
        tm.assert_frame_equal(casted, expected)

    def test_astype_mixed_float(self, mixed_float_frame):
        # mixed casting
        casted = mixed_float_frame.reindex(
            columns=["A", "B"]).astype("float32")
        _check_cast(casted, "float32")

        casted = mixed_float_frame.reindex(
            columns=["A", "B"]).astype("float16")
        _check_cast(casted, "float16")

    def test_astype_mixed_type(self, mixed_type_frame):
        # mixed casting
        mn = mixed_type_frame._get_numeric_data().copy()
        mn["little_float"] = np.array(12345.0, dtype="float16")
        mn["big_float"] = np.array(123456789101112.0, dtype="float64")

        casted = mn.astype("float64")
        _check_cast(casted, "float64")

        casted = mn.astype("int64")
        _check_cast(casted, "int64")

        casted = mn.reindex(columns=["little_float"]).astype("float16")
        _check_cast(casted, "float16")

        casted = mn.astype("float32")
        _check_cast(casted, "float32")

        casted = mn.astype("int32")
        _check_cast(casted, "int32")

        # to object
        casted = mn.astype("O")
        _check_cast(casted, "object")

    def test_astype_with_exclude_string(self, float_frame):
        df = float_frame.copy()
        expected = float_frame.astype(int)
        df["string"] = "foo"
        casted = df.astype(int, errors="ignore")

        expected["string"] = "foo"
        tm.assert_frame_equal(casted, expected)

        df = float_frame.copy()
        expected = float_frame.astype(np.int32)
        df["string"] = "foo"
        casted = df.astype(np.int32, errors="ignore")

        expected["string"] = "foo"
        tm.assert_frame_equal(casted, expected)

    def test_astype_with_view_float(self, float_frame):

        # this is the only real reason to do it this way
        tf = np.round(float_frame).astype(np.int32)
        casted = tf.astype(np.float32, copy=False)

        # TODO(wesm): verification?
        tf = float_frame.astype(np.float64)
        casted = tf.astype(np.int64, copy=False)  # noqa

    def test_astype_with_view_mixed_float(self, mixed_float_frame):

        tf = mixed_float_frame.reindex(columns=["A", "B", "C"])

        casted = tf.astype(np.int64)
        casted = tf.astype(np.float32)  # noqa

    @pytest.mark.parametrize("dtype", [np.int32, np.int64])
    @pytest.mark.parametrize("val", [np.nan, np.inf])
    def test_astype_cast_nan_inf_int(self, val, dtype):
        # see GH#14265
        #
        # Check NaN and inf --> raise error when converting to int.
        msg = "Cannot convert non-finite values \\(NA or inf\\) to integer"
        df = DataFrame([val])

        with pytest.raises(ValueError, match=msg):
            df.astype(dtype)

    def test_astype_str(self):
        # see GH#9757
        a = Series(date_range("2010-01-04", periods=5))
        b = Series(date_range("3/6/2012 00:00", periods=5, tz="US/Eastern"))
        c = Series([Timedelta(x, unit="d") for x in range(5)])
        d = Series(range(5))
        e = Series([0.0, 0.2, 0.4, 0.6, 0.8])

        df = DataFrame({"a": a, "b": b, "c": c, "d": d, "e": e})

        # Datetime-like
        result = df.astype(str)

        expected = DataFrame({
            "a":
            list(map(str, map(lambda x: Timestamp(x)._date_repr, a._values))),
            "b":
            list(map(str, map(Timestamp, b._values))),
            "c":
            list(map(lambda x: Timedelta(x)._repr_base(), c._values)),
            "d":
            list(map(str, d._values)),
            "e":
            list(map(str, e._values)),
        })

        tm.assert_frame_equal(result, expected)

    def test_astype_str_float(self):
        # see GH#11302
        result = DataFrame([np.NaN]).astype(str)
        expected = DataFrame(["nan"])

        tm.assert_frame_equal(result, expected)
        result = DataFrame([1.12345678901234567890]).astype(str)

        # < 1.14 truncates
        # >= 1.14 preserves the full repr
        val = "1.12345678901" if _np_version_under1p14 else "1.1234567890123457"
        expected = DataFrame([val])
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("dtype_class", [dict, Series])
    def test_astype_dict_like(self, dtype_class):
        # GH7271 & GH16717
        a = Series(date_range("2010-01-04", periods=5))
        b = Series(range(5))
        c = Series([0.0, 0.2, 0.4, 0.6, 0.8])
        d = Series(["1.0", "2", "3.14", "4", "5.4"])
        df = DataFrame({"a": a, "b": b, "c": c, "d": d})
        original = df.copy(deep=True)

        # change type of a subset of columns
        dt1 = dtype_class({"b": "str", "d": "float32"})
        result = df.astype(dt1)
        expected = DataFrame({
            "a":
            a,
            "b":
            Series(["0", "1", "2", "3", "4"]),
            "c":
            c,
            "d":
            Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float32"),
        })
        tm.assert_frame_equal(result, expected)
        tm.assert_frame_equal(df, original)

        dt2 = dtype_class({"b": np.float32, "c": "float32", "d": np.float64})
        result = df.astype(dt2)
        expected = DataFrame({
            "a":
            a,
            "b":
            Series([0.0, 1.0, 2.0, 3.0, 4.0], dtype="float32"),
            "c":
            Series([0.0, 0.2, 0.4, 0.6, 0.8], dtype="float32"),
            "d":
            Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float64"),
        })
        tm.assert_frame_equal(result, expected)
        tm.assert_frame_equal(df, original)

        # change all columns
        dt3 = dtype_class({"a": str, "b": str, "c": str, "d": str})
        tm.assert_frame_equal(df.astype(dt3), df.astype(str))
        tm.assert_frame_equal(df, original)

        # error should be raised when using something other than column labels
        # in the keys of the dtype dict
        dt4 = dtype_class({"b": str, 2: str})
        dt5 = dtype_class({"e": str})
        msg = "Only a column name can be used for the key in a dtype mappings argument"
        with pytest.raises(KeyError, match=msg):
            df.astype(dt4)
        with pytest.raises(KeyError, match=msg):
            df.astype(dt5)
        tm.assert_frame_equal(df, original)

        # if the dtypes provided are the same as the original dtypes, the
        # resulting DataFrame should be the same as the original DataFrame
        dt6 = dtype_class({col: df[col].dtype for col in df.columns})
        equiv = df.astype(dt6)
        tm.assert_frame_equal(df, equiv)
        tm.assert_frame_equal(df, original)

        # GH#16717
        # if dtypes provided is empty, the resulting DataFrame
        # should be the same as the original DataFrame
        dt7 = dtype_class({}) if dtype_class is dict else dtype_class(
            {}, dtype=object)
        equiv = df.astype(dt7)
        tm.assert_frame_equal(df, equiv)
        tm.assert_frame_equal(df, original)

    def test_astype_duplicate_col(self):
        a1 = Series([1, 2, 3, 4, 5], name="a")
        b = Series([0.1, 0.2, 0.4, 0.6, 0.8], name="b")
        a2 = Series([0, 1, 2, 3, 4], name="a")
        df = concat([a1, b, a2], axis=1)

        result = df.astype(str)
        a1_str = Series(["1", "2", "3", "4", "5"], dtype="str", name="a")
        b_str = Series(["0.1", "0.2", "0.4", "0.6", "0.8"],
                       dtype=str,
                       name="b")
        a2_str = Series(["0", "1", "2", "3", "4"], dtype="str", name="a")
        expected = concat([a1_str, b_str, a2_str], axis=1)
        tm.assert_frame_equal(result, expected)

        result = df.astype({"a": "str"})
        expected = concat([a1_str, b, a2_str], axis=1)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "dtype",
        [
            "category",
            CategoricalDtype(),
            CategoricalDtype(ordered=True),
            CategoricalDtype(ordered=False),
            CategoricalDtype(categories=list("abcdef")),
            CategoricalDtype(categories=list("edba"), ordered=False),
            CategoricalDtype(categories=list("edcb"), ordered=True),
        ],
        ids=repr,
    )
    def test_astype_categorical(self, dtype):
        # GH#18099
        d = {"A": list("abbc"), "B": list("bccd"), "C": list("cdde")}
        df = DataFrame(d)
        result = df.astype(dtype)
        expected = DataFrame({k: Categorical(d[k], dtype=dtype) for k in d})
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "cls", [CategoricalDtype, DatetimeTZDtype, IntervalDtype])
    def test_astype_categoricaldtype_class_raises(self, cls):
        df = DataFrame({"A": ["a", "a", "b", "c"]})
        xpr = f"Expected an instance of {cls.__name__}"
        with pytest.raises(TypeError, match=xpr):
            df.astype({"A": cls})

        with pytest.raises(TypeError, match=xpr):
            df["A"].astype(cls)

    @pytest.mark.parametrize("dtype", ["Int64", "Int32", "Int16"])
    def test_astype_extension_dtypes(self, dtype):
        # GH#22578
        df = DataFrame([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]],
                       columns=["a", "b"])

        expected1 = DataFrame({
            "a": integer_array([1, 3, 5], dtype=dtype),
            "b": integer_array([2, 4, 6], dtype=dtype),
        })
        tm.assert_frame_equal(df.astype(dtype), expected1)
        tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1)
        tm.assert_frame_equal(df.astype(dtype).astype("float64"), df)

        df = DataFrame([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]],
                       columns=["a", "b"])
        df["b"] = df["b"].astype(dtype)
        expected2 = DataFrame({
            "a": [1.0, 3.0, 5.0],
            "b": integer_array([2, 4, 6], dtype=dtype)
        })
        tm.assert_frame_equal(df, expected2)

        tm.assert_frame_equal(df.astype(dtype), expected1)
        tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1)

    @pytest.mark.parametrize("dtype", ["Int64", "Int32", "Int16"])
    def test_astype_extension_dtypes_1d(self, dtype):
        # GH#22578
        df = DataFrame({"a": [1.0, 2.0, 3.0]})

        expected1 = DataFrame({"a": integer_array([1, 2, 3], dtype=dtype)})
        tm.assert_frame_equal(df.astype(dtype), expected1)
        tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1)

        df = DataFrame({"a": [1.0, 2.0, 3.0]})
        df["a"] = df["a"].astype(dtype)
        expected2 = DataFrame({"a": integer_array([1, 2, 3], dtype=dtype)})
        tm.assert_frame_equal(df, expected2)

        tm.assert_frame_equal(df.astype(dtype), expected1)
        tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1)

    @pytest.mark.parametrize("dtype", ["category", "Int64"])
    def test_astype_extension_dtypes_duplicate_col(self, dtype):
        # GH#24704
        a1 = Series([0, np.nan, 4], name="a")
        a2 = Series([np.nan, 3, 5], name="a")
        df = concat([a1, a2], axis=1)

        result = df.astype(dtype)
        expected = concat([a1.astype(dtype), a2.astype(dtype)], axis=1)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("dtype", [{
        100: "float64",
        200: "uint64"
    }, "category", "float64"])
    def test_astype_column_metadata(self, dtype):
        # GH#19920
        columns = UInt64Index([100, 200, 300], name="foo")
        df = DataFrame(np.arange(15).reshape(5, 3), columns=columns)
        df = df.astype(dtype)
        tm.assert_index_equal(df.columns, columns)

    @pytest.mark.parametrize("dtype", ["M8", "m8"])
    @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
    def test_astype_from_datetimelike_to_object(self, dtype, unit):
        # tests astype to object dtype
        # GH#19223 / GH#12425
        dtype = f"{dtype}[{unit}]"
        arr = np.array([[1, 2, 3]], dtype=dtype)
        df = DataFrame(arr)
        result = df.astype(object)
        assert (result.dtypes == object).all()

        if dtype.startswith("M8"):
            assert result.iloc[0, 0] == Timestamp(1, unit=unit)
        else:
            assert result.iloc[0, 0] == Timedelta(1, unit=unit)

    @pytest.mark.parametrize("arr_dtype", [np.int64, np.float64])
    @pytest.mark.parametrize("dtype", ["M8", "m8"])
    @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
    def test_astype_to_datetimelike_unit(self, arr_dtype, dtype, unit):
        # tests all units from numeric origination
        # GH#19223 / GH#12425
        dtype = f"{dtype}[{unit}]"
        arr = np.array([[1, 2, 3]], dtype=arr_dtype)
        df = DataFrame(arr)
        result = df.astype(dtype)
        expected = DataFrame(arr.astype(dtype))

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
    def test_astype_to_datetime_unit(self, unit):
        # tests all units from datetime origination
        # GH#19223
        dtype = f"M8[{unit}]"
        arr = np.array([[1, 2, 3]], dtype=dtype)
        df = DataFrame(arr)
        result = df.astype(dtype)
        expected = DataFrame(arr.astype(dtype))

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("unit", ["ns"])
    def test_astype_to_timedelta_unit_ns(self, unit):
        # preserver the timedelta conversion
        # GH#19223
        dtype = f"m8[{unit}]"
        arr = np.array([[1, 2, 3]], dtype=dtype)
        df = DataFrame(arr)
        result = df.astype(dtype)
        expected = DataFrame(arr.astype(dtype))

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("unit", ["us", "ms", "s", "h", "m", "D"])
    def test_astype_to_timedelta_unit(self, unit):
        # coerce to float
        # GH#19223
        dtype = f"m8[{unit}]"
        arr = np.array([[1, 2, 3]], dtype=dtype)
        df = DataFrame(arr)
        result = df.astype(dtype)
        expected = DataFrame(df.values.astype(dtype).astype(float))

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
    def test_astype_to_incorrect_datetimelike(self, unit):
        # trying to astype a m to a M, or vice-versa
        # GH#19224
        dtype = f"M8[{unit}]"
        other = f"m8[{unit}]"

        df = DataFrame(np.array([[1, 2, 3]], dtype=dtype))
        msg = (fr"cannot astype a datetimelike from \[datetime64\[ns\]\] to "
               fr"\[timedelta64\[{unit}\]\]")
        with pytest.raises(TypeError, match=msg):
            df.astype(other)

        msg = (fr"cannot astype a timedelta from \[timedelta64\[ns\]\] to "
               fr"\[datetime64\[{unit}\]\]")
        df = DataFrame(np.array([[1, 2, 3]], dtype=other))
        with pytest.raises(TypeError, match=msg):
            df.astype(dtype)

    def test_astype_arg_for_errors(self):
        # GH#14878

        df = DataFrame([1, 2, 3])

        msg = ("Expected value of kwarg 'errors' to be one of "
               "['raise', 'ignore']. Supplied value is 'True'")
        with pytest.raises(ValueError, match=re.escape(msg)):
            df.astype(np.float64, errors=True)

        df.astype(np.int8, errors="ignore")

    def test_astype_arg_for_errors_dictlist(self):
        # GH#25905
        df = DataFrame([
            {
                "a": "1",
                "b": "16.5%",
                "c": "test"
            },
            {
                "a": "2.2",
                "b": "15.3",
                "c": "another_test"
            },
        ])
        expected = DataFrame([
            {
                "a": 1.0,
                "b": "16.5%",
                "c": "test"
            },
            {
                "a": 2.2,
                "b": "15.3",
                "c": "another_test"
            },
        ])
        type_dict = {"a": "float64", "b": "float64", "c": "object"}

        result = df.astype(dtype=type_dict, errors="ignore")

        tm.assert_frame_equal(result, expected)

    def test_astype_dt64tz(self, timezone_frame):
        # astype
        expected = np.array(
            [
                [
                    Timestamp("2013-01-01 00:00:00"),
                    Timestamp("2013-01-02 00:00:00"),
                    Timestamp("2013-01-03 00:00:00"),
                ],
                [
                    Timestamp("2013-01-01 00:00:00-0500", tz="US/Eastern"),
                    NaT,
                    Timestamp("2013-01-03 00:00:00-0500", tz="US/Eastern"),
                ],
                [
                    Timestamp("2013-01-01 00:00:00+0100", tz="CET"),
                    NaT,
                    Timestamp("2013-01-03 00:00:00+0100", tz="CET"),
                ],
            ],
            dtype=object,
        ).T
        expected = DataFrame(
            expected,
            index=timezone_frame.index,
            columns=timezone_frame.columns,
            dtype=object,
        )
        result = timezone_frame.astype(object)
        tm.assert_frame_equal(result, expected)

        result = timezone_frame.astype("datetime64[ns]")
        expected = DataFrame({
            "A":
            date_range("20130101", periods=3),
            "B":
            (date_range("20130101", periods=3,
                        tz="US/Eastern").tz_convert("UTC").tz_localize(None)),
            "C": (date_range("20130101", periods=3,
                             tz="CET").tz_convert("UTC").tz_localize(None)),
        })
        expected.iloc[1, 1] = NaT
        expected.iloc[1, 2] = NaT
        tm.assert_frame_equal(result, expected)

    def test_astype_dt64tz_to_str(self, timezone_frame):
        # str formatting
        result = timezone_frame.astype(str)
        expected = DataFrame(
            [
                [
                    "2013-01-01",
                    "2013-01-01 00:00:00-05:00",
                    "2013-01-01 00:00:00+01:00",
                ],
                ["2013-01-02", "NaT", "NaT"],
                [
                    "2013-01-03",
                    "2013-01-03 00:00:00-05:00",
                    "2013-01-03 00:00:00+01:00",
                ],
            ],
            columns=timezone_frame.columns,
        )
        tm.assert_frame_equal(result, expected)

        with option_context("display.max_columns", 20):
            result = str(timezone_frame)
            assert (
                "0 2013-01-01 2013-01-01 00:00:00-05:00 2013-01-01 00:00:00+01:00"
            ) in result
            assert (
                "1 2013-01-02                       NaT                       NaT"
            ) in result
            assert (
                "2 2013-01-03 2013-01-03 00:00:00-05:00 2013-01-03 00:00:00+01:00"
            ) in result

Пример #25

Показать файл

    def replot_single(self, i):
        col = self.joint_data.columns[i]
        self.outboxes[i].clear_output(wait=True)
        with self.outboxes[i]:
            plt.clf()
            fig = plt.gcf()
            fig.set_figwidth(3)
            fig.set_figheight(1)

            if self.scope.get_dtype(col) in ('cat', 'bool'):

                if self.scope.get_dtype(col) == 'cat':
                    bar_labels = self.scope.get_cat_values(col)
                else:
                    bar_labels = [False, True]

                v = self.joint_data[col].astype(
                    CategoricalDtype(categories=bar_labels,
                                     ordered=False)).cat.codes
                bar_heights, _ = numpy.histogram(v,
                                                 bins=numpy.arange(
                                                     0,
                                                     len(bar_labels) + 1))
                bar_x = numpy.arange(0, len(bar_labels))
                plt.bar(bar_x, bar_heights, 0.8, align='edge')
                filter_vals = self.joint_data.loc[self.joint_filters.all(
                    axis=1), col].astype(
                        CategoricalDtype(categories=bar_labels,
                                         ordered=False)).cat.codes
                bar_heights, _ = numpy.histogram(filter_vals,
                                                 bins=numpy.arange(
                                                     0,
                                                     len(bar_labels) + 1))
                plt.bar(bar_x, bar_heights, 0.8, align='edge')
                plt.xticks(bar_x + 0.4, [str(i) for i in bar_labels])
                plt.show()

                # bar_labels, bar_heights = numpy.unique(self.joint_data[col], return_counts=True)
                # bar_x = numpy.arange(0,len(bar_labels))
                # plt.bar(bar_x, bar_heights, 0.8, align='edge')
                # from pandas import CategoricalDtype
                # filter_vals = self.joint_data.loc[self.joint_filters.all(axis=1), col].astype(
                # 	CategoricalDtype(categories=bar_labels, ordered=False)
                # ).cat.codes
                # bar_heights, _ = numpy.histogram(filter_vals, bins=numpy.arange(0,len(bar_labels)+1))
                # plt.bar(bar_x, bar_heights, 0.8, align='edge')
                # plt.xticks(bar_x+0.4, bar_labels)
                # plt.show()
            else:

                bins = 20 if col not in self.data.strategy_names else 20
                #n, bins, patches = plt.hist(self.joint_data[col], bins=bins)
                bar_heights, bar_x = numpy.histogram(self.joint_data[col],
                                                     bins=bins)
                plt.bar(bar_x[:-1],
                        bar_heights,
                        bar_x[1:] - bar_x[:-1],
                        align='edge')
                #n, bins, patches = plt.hist(self.joint_data.loc[self.joint_filters.all(axis=1), col], bins=bins)

                bar_heights, bar_x = numpy.histogram(
                    self.joint_data.loc[self.joint_filters.all(axis=1), col],
                    bins=bar_x)
                plt.bar(bar_x[:-1],
                        bar_heights,
                        bar_x[1:] - bar_x[:-1],
                        align='edge')
                plt.show()

Пример #26

Показать файл

class TestAstype:
    def test_astype_float(self, float_frame):
        casted = float_frame.astype(int)
        expected = DataFrame(
            float_frame.values.astype(int),
            index=float_frame.index,
            columns=float_frame.columns,
        )
        tm.assert_frame_equal(casted, expected)

        casted = float_frame.astype(np.int32)
        expected = DataFrame(
            float_frame.values.astype(np.int32),
            index=float_frame.index,
            columns=float_frame.columns,
        )
        tm.assert_frame_equal(casted, expected)

        float_frame["foo"] = "5"
        casted = float_frame.astype(int)
        expected = DataFrame(
            float_frame.values.astype(int),
            index=float_frame.index,
            columns=float_frame.columns,
        )
        tm.assert_frame_equal(casted, expected)

    def test_astype_mixed_float(self, mixed_float_frame):
        # mixed casting
        casted = mixed_float_frame.reindex(
            columns=["A", "B"]).astype("float32")
        _check_cast(casted, "float32")

        casted = mixed_float_frame.reindex(
            columns=["A", "B"]).astype("float16")
        _check_cast(casted, "float16")

    def test_astype_mixed_type(self, mixed_type_frame):
        # mixed casting
        mn = mixed_type_frame._get_numeric_data().copy()
        mn["little_float"] = np.array(12345.0, dtype="float16")
        mn["big_float"] = np.array(123456789101112.0, dtype="float64")

        casted = mn.astype("float64")
        _check_cast(casted, "float64")

        casted = mn.astype("int64")
        _check_cast(casted, "int64")

        casted = mn.reindex(columns=["little_float"]).astype("float16")
        _check_cast(casted, "float16")

        casted = mn.astype("float32")
        _check_cast(casted, "float32")

        casted = mn.astype("int32")
        _check_cast(casted, "int32")

        # to object
        casted = mn.astype("O")
        _check_cast(casted, "object")

    @td.skip_array_manager_not_yet_implemented
    def test_astype_with_exclude_string(self, float_frame):
        df = float_frame.copy()
        expected = float_frame.astype(int)
        df["string"] = "foo"
        casted = df.astype(int, errors="ignore")

        expected["string"] = "foo"
        tm.assert_frame_equal(casted, expected)

        df = float_frame.copy()
        expected = float_frame.astype(np.int32)
        df["string"] = "foo"
        casted = df.astype(np.int32, errors="ignore")

        expected["string"] = "foo"
        tm.assert_frame_equal(casted, expected)

    def test_astype_with_view_float(self, float_frame):

        # this is the only real reason to do it this way
        tf = np.round(float_frame).astype(np.int32)
        casted = tf.astype(np.float32, copy=False)

        # TODO(wesm): verification?
        tf = float_frame.astype(np.float64)
        casted = tf.astype(np.int64, copy=False)  # noqa

    def test_astype_with_view_mixed_float(self, mixed_float_frame):

        tf = mixed_float_frame.reindex(columns=["A", "B", "C"])

        casted = tf.astype(np.int64)
        casted = tf.astype(np.float32)  # noqa

    @td.skip_array_manager_not_yet_implemented
    @pytest.mark.parametrize("dtype", [np.int32, np.int64])
    @pytest.mark.parametrize("val", [np.nan, np.inf])
    def test_astype_cast_nan_inf_int(self, val, dtype):
        # see GH#14265
        #
        # Check NaN and inf --> raise error when converting to int.
        msg = "Cannot convert non-finite values \\(NA or inf\\) to integer"
        df = DataFrame([val])

        with pytest.raises(ValueError, match=msg):
            df.astype(dtype)

    def test_astype_str(self):
        # see GH#9757
        a = Series(date_range("2010-01-04", periods=5))
        b = Series(date_range("3/6/2012 00:00", periods=5, tz="US/Eastern"))
        c = Series([Timedelta(x, unit="d") for x in range(5)])
        d = Series(range(5))
        e = Series([0.0, 0.2, 0.4, 0.6, 0.8])

        df = DataFrame({"a": a, "b": b, "c": c, "d": d, "e": e})

        # Datetime-like
        result = df.astype(str)

        expected = DataFrame({
            "a":
            list(map(str, map(lambda x: Timestamp(x)._date_repr, a._values))),
            "b":
            list(map(str, map(Timestamp, b._values))),
            "c":
            list(map(lambda x: Timedelta(x)._repr_base(), c._values)),
            "d":
            list(map(str, d._values)),
            "e":
            list(map(str, e._values)),
        })

        tm.assert_frame_equal(result, expected)

    def test_astype_str_float(self):
        # see GH#11302
        result = DataFrame([np.NaN]).astype(str)
        expected = DataFrame(["nan"])

        tm.assert_frame_equal(result, expected)
        result = DataFrame([1.12345678901234567890]).astype(str)

        val = "1.1234567890123457"
        expected = DataFrame([val])
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("dtype_class", [dict, Series])
    def test_astype_dict_like(self, dtype_class):
        # GH7271 & GH16717
        a = Series(date_range("2010-01-04", periods=5))
        b = Series(range(5))
        c = Series([0.0, 0.2, 0.4, 0.6, 0.8])
        d = Series(["1.0", "2", "3.14", "4", "5.4"])
        df = DataFrame({"a": a, "b": b, "c": c, "d": d})
        original = df.copy(deep=True)

        # change type of a subset of columns
        dt1 = dtype_class({"b": "str", "d": "float32"})
        result = df.astype(dt1)
        expected = DataFrame({
            "a":
            a,
            "b":
            Series(["0", "1", "2", "3", "4"]),
            "c":
            c,
            "d":
            Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float32"),
        })
        tm.assert_frame_equal(result, expected)
        tm.assert_frame_equal(df, original)

        dt2 = dtype_class({"b": np.float32, "c": "float32", "d": np.float64})
        result = df.astype(dt2)
        expected = DataFrame({
            "a":
            a,
            "b":
            Series([0.0, 1.0, 2.0, 3.0, 4.0], dtype="float32"),
            "c":
            Series([0.0, 0.2, 0.4, 0.6, 0.8], dtype="float32"),
            "d":
            Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float64"),
        })
        tm.assert_frame_equal(result, expected)
        tm.assert_frame_equal(df, original)

        # change all columns
        dt3 = dtype_class({"a": str, "b": str, "c": str, "d": str})
        tm.assert_frame_equal(df.astype(dt3), df.astype(str))
        tm.assert_frame_equal(df, original)

        # error should be raised when using something other than column labels
        # in the keys of the dtype dict
        dt4 = dtype_class({"b": str, 2: str})
        dt5 = dtype_class({"e": str})
        msg = "Only a column name can be used for the key in a dtype mappings argument"
        with pytest.raises(KeyError, match=msg):
            df.astype(dt4)
        with pytest.raises(KeyError, match=msg):
            df.astype(dt5)
        tm.assert_frame_equal(df, original)

        # if the dtypes provided are the same as the original dtypes, the
        # resulting DataFrame should be the same as the original DataFrame
        dt6 = dtype_class({col: df[col].dtype for col in df.columns})
        equiv = df.astype(dt6)
        tm.assert_frame_equal(df, equiv)
        tm.assert_frame_equal(df, original)

        # GH#16717
        # if dtypes provided is empty, the resulting DataFrame
        # should be the same as the original DataFrame
        dt7 = dtype_class({}) if dtype_class is dict else dtype_class(
            {}, dtype=object)
        equiv = df.astype(dt7)
        tm.assert_frame_equal(df, equiv)
        tm.assert_frame_equal(df, original)

    def test_astype_duplicate_col(self):
        a1 = Series([1, 2, 3, 4, 5], name="a")
        b = Series([0.1, 0.2, 0.4, 0.6, 0.8], name="b")
        a2 = Series([0, 1, 2, 3, 4], name="a")
        df = concat([a1, b, a2], axis=1)

        result = df.astype(str)
        a1_str = Series(["1", "2", "3", "4", "5"], dtype="str", name="a")
        b_str = Series(["0.1", "0.2", "0.4", "0.6", "0.8"],
                       dtype=str,
                       name="b")
        a2_str = Series(["0", "1", "2", "3", "4"], dtype="str", name="a")
        expected = concat([a1_str, b_str, a2_str], axis=1)
        tm.assert_frame_equal(result, expected)

        result = df.astype({"a": "str"})
        expected = concat([a1_str, b, a2_str], axis=1)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "dtype",
        [
            "category",
            CategoricalDtype(),
            CategoricalDtype(ordered=True),
            CategoricalDtype(ordered=False),
            CategoricalDtype(categories=list("abcdef")),
            CategoricalDtype(categories=list("edba"), ordered=False),
            CategoricalDtype(categories=list("edcb"), ordered=True),
        ],
        ids=repr,
    )
    def test_astype_categorical(self, dtype):
        # GH#18099
        d = {"A": list("abbc"), "B": list("bccd"), "C": list("cdde")}
        df = DataFrame(d)
        result = df.astype(dtype)
        expected = DataFrame({k: Categorical(d[k], dtype=dtype) for k in d})
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "cls", [CategoricalDtype, DatetimeTZDtype, IntervalDtype])
    def test_astype_categoricaldtype_class_raises(self, cls):
        df = DataFrame({"A": ["a", "a", "b", "c"]})
        xpr = f"Expected an instance of {cls.__name__}"
        with pytest.raises(TypeError, match=xpr):
            df.astype({"A": cls})

        with pytest.raises(TypeError, match=xpr):
            df["A"].astype(cls)

    @pytest.mark.parametrize("dtype", ["Int64", "Int32", "Int16"])
    def test_astype_extension_dtypes(self, dtype):
        # GH#22578
        df = DataFrame([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]],
                       columns=["a", "b"])

        expected1 = DataFrame({
            "a": pd.array([1, 3, 5], dtype=dtype),
            "b": pd.array([2, 4, 6], dtype=dtype),
        })
        tm.assert_frame_equal(df.astype(dtype), expected1)
        tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1)
        tm.assert_frame_equal(df.astype(dtype).astype("float64"), df)

        df = DataFrame([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]],
                       columns=["a", "b"])
        df["b"] = df["b"].astype(dtype)
        expected2 = DataFrame({
            "a": [1.0, 3.0, 5.0],
            "b": pd.array([2, 4, 6], dtype=dtype)
        })
        tm.assert_frame_equal(df, expected2)

        tm.assert_frame_equal(df.astype(dtype), expected1)
        tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1)

    @pytest.mark.parametrize("dtype", ["Int64", "Int32", "Int16"])
    def test_astype_extension_dtypes_1d(self, dtype):
        # GH#22578
        df = DataFrame({"a": [1.0, 2.0, 3.0]})

        expected1 = DataFrame({"a": pd.array([1, 2, 3], dtype=dtype)})
        tm.assert_frame_equal(df.astype(dtype), expected1)
        tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1)

        df = DataFrame({"a": [1.0, 2.0, 3.0]})
        df["a"] = df["a"].astype(dtype)
        expected2 = DataFrame({"a": pd.array([1, 2, 3], dtype=dtype)})
        tm.assert_frame_equal(df, expected2)

        tm.assert_frame_equal(df.astype(dtype), expected1)
        tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1)

    @pytest.mark.parametrize("dtype", ["category", "Int64"])
    def test_astype_extension_dtypes_duplicate_col(self, dtype):
        # GH#24704
        a1 = Series([0, np.nan, 4], name="a")
        a2 = Series([np.nan, 3, 5], name="a")
        df = concat([a1, a2], axis=1)

        result = df.astype(dtype)
        expected = concat([a1.astype(dtype), a2.astype(dtype)], axis=1)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("dtype", [{
        100: "float64",
        200: "uint64"
    }, "category", "float64"])
    def test_astype_column_metadata(self, dtype):
        # GH#19920
        columns = UInt64Index([100, 200, 300], name="foo")
        df = DataFrame(np.arange(15).reshape(5, 3), columns=columns)
        df = df.astype(dtype)
        tm.assert_index_equal(df.columns, columns)

    @pytest.mark.parametrize("dtype", ["M8", "m8"])
    @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
    def test_astype_from_datetimelike_to_object(self, dtype, unit):
        # tests astype to object dtype
        # GH#19223 / GH#12425
        dtype = f"{dtype}[{unit}]"
        arr = np.array([[1, 2, 3]], dtype=dtype)
        df = DataFrame(arr)
        result = df.astype(object)
        assert (result.dtypes == object).all()

        if dtype.startswith("M8"):
            assert result.iloc[0, 0] == Timestamp(1, unit=unit)
        else:
            assert result.iloc[0, 0] == Timedelta(1, unit=unit)

    @pytest.mark.parametrize("arr_dtype", [np.int64, np.float64])
    @pytest.mark.parametrize("dtype", ["M8", "m8"])
    @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
    def test_astype_to_datetimelike_unit(self, arr_dtype, dtype, unit):
        # tests all units from numeric origination
        # GH#19223 / GH#12425
        dtype = f"{dtype}[{unit}]"
        arr = np.array([[1, 2, 3]], dtype=arr_dtype)
        df = DataFrame(arr)
        result = df.astype(dtype)
        expected = DataFrame(arr.astype(dtype))

        tm.assert_frame_equal(result, expected)

    @td.skip_array_manager_not_yet_implemented
    @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
    def test_astype_to_datetime_unit(self, unit):
        # tests all units from datetime origination
        # GH#19223
        dtype = f"M8[{unit}]"
        arr = np.array([[1, 2, 3]], dtype=dtype)
        df = DataFrame(arr)
        result = df.astype(dtype)
        expected = DataFrame(arr.astype(dtype))

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("unit", ["ns"])
    def test_astype_to_timedelta_unit_ns(self, unit):
        # preserver the timedelta conversion
        # GH#19223
        dtype = f"m8[{unit}]"
        arr = np.array([[1, 2, 3]], dtype=dtype)
        df = DataFrame(arr)
        result = df.astype(dtype)
        expected = DataFrame(arr.astype(dtype))

        tm.assert_frame_equal(result, expected)

    @td.skip_array_manager_not_yet_implemented
    @pytest.mark.parametrize("unit", ["us", "ms", "s", "h", "m", "D"])
    def test_astype_to_timedelta_unit(self, unit):
        # coerce to float
        # GH#19223
        dtype = f"m8[{unit}]"
        arr = np.array([[1, 2, 3]], dtype=dtype)
        df = DataFrame(arr)
        result = df.astype(dtype)
        expected = DataFrame(df.values.astype(dtype).astype(float))

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
    def test_astype_to_incorrect_datetimelike(self, unit):
        # trying to astype a m to a M, or vice-versa
        # GH#19224
        dtype = f"M8[{unit}]"
        other = f"m8[{unit}]"

        df = DataFrame(np.array([[1, 2, 3]], dtype=dtype))
        msg = fr"Cannot cast DatetimeArray to dtype timedelta64\[{unit}\]"
        with pytest.raises(TypeError, match=msg):
            df.astype(other)

        msg = fr"Cannot cast TimedeltaArray to dtype datetime64\[{unit}\]"
        df = DataFrame(np.array([[1, 2, 3]], dtype=other))
        with pytest.raises(TypeError, match=msg):
            df.astype(dtype)

    @td.skip_array_manager_not_yet_implemented
    def test_astype_arg_for_errors(self):
        # GH#14878

        df = DataFrame([1, 2, 3])

        msg = ("Expected value of kwarg 'errors' to be one of "
               "['raise', 'ignore']. Supplied value is 'True'")
        with pytest.raises(ValueError, match=re.escape(msg)):
            df.astype(np.float64, errors=True)

        df.astype(np.int8, errors="ignore")

    def test_astype_arg_for_errors_dictlist(self):
        # GH#25905
        df = DataFrame([
            {
                "a": "1",
                "b": "16.5%",
                "c": "test"
            },
            {
                "a": "2.2",
                "b": "15.3",
                "c": "another_test"
            },
        ])
        expected = DataFrame([
            {
                "a": 1.0,
                "b": "16.5%",
                "c": "test"
            },
            {
                "a": 2.2,
                "b": "15.3",
                "c": "another_test"
            },
        ])
        type_dict = {"a": "float64", "b": "float64", "c": "object"}

        result = df.astype(dtype=type_dict, errors="ignore")

        tm.assert_frame_equal(result, expected)

    def test_astype_dt64tz(self, timezone_frame):
        # astype
        expected = np.array(
            [
                [
                    Timestamp("2013-01-01 00:00:00"),
                    Timestamp("2013-01-02 00:00:00"),
                    Timestamp("2013-01-03 00:00:00"),
                ],
                [
                    Timestamp("2013-01-01 00:00:00-0500", tz="US/Eastern"),
                    NaT,
                    Timestamp("2013-01-03 00:00:00-0500", tz="US/Eastern"),
                ],
                [
                    Timestamp("2013-01-01 00:00:00+0100", tz="CET"),
                    NaT,
                    Timestamp("2013-01-03 00:00:00+0100", tz="CET"),
                ],
            ],
            dtype=object,
        ).T
        expected = DataFrame(
            expected,
            index=timezone_frame.index,
            columns=timezone_frame.columns,
            dtype=object,
        )
        result = timezone_frame.astype(object)
        tm.assert_frame_equal(result, expected)

        with tm.assert_produces_warning(FutureWarning):
            # dt64tz->dt64 deprecated
            result = timezone_frame.astype("datetime64[ns]")
        expected = DataFrame({
            "A":
            date_range("20130101", periods=3),
            "B":
            (date_range("20130101", periods=3,
                        tz="US/Eastern").tz_convert("UTC").tz_localize(None)),
            "C": (date_range("20130101", periods=3,
                             tz="CET").tz_convert("UTC").tz_localize(None)),
        })
        expected.iloc[1, 1] = NaT
        expected.iloc[1, 2] = NaT
        tm.assert_frame_equal(result, expected)

    def test_astype_dt64tz_to_str(self, timezone_frame):
        # str formatting
        result = timezone_frame.astype(str)
        expected = DataFrame(
            [
                [
                    "2013-01-01",
                    "2013-01-01 00:00:00-05:00",
                    "2013-01-01 00:00:00+01:00",
                ],
                ["2013-01-02", "NaT", "NaT"],
                [
                    "2013-01-03",
                    "2013-01-03 00:00:00-05:00",
                    "2013-01-03 00:00:00+01:00",
                ],
            ],
            columns=timezone_frame.columns,
        )
        tm.assert_frame_equal(result, expected)

        with option_context("display.max_columns", 20):
            result = str(timezone_frame)
            assert (
                "0 2013-01-01 2013-01-01 00:00:00-05:00 2013-01-01 00:00:00+01:00"
            ) in result
            assert (
                "1 2013-01-02                       NaT                       NaT"
            ) in result
            assert (
                "2 2013-01-03 2013-01-03 00:00:00-05:00 2013-01-03 00:00:00+01:00"
            ) in result

    def test_astype_empty_dtype_dict(self):
        # issue mentioned further down in the following issue's thread
        # https://github.com/pandas-dev/pandas/issues/33113
        df = DataFrame()
        result = df.astype({})
        tm.assert_frame_equal(result, df)
        assert result is not df

    @td.skip_array_manager_not_yet_implemented  # TODO(ArrayManager) ignore keyword
    @pytest.mark.parametrize(
        "df",
        [
            DataFrame(Series(["x", "y", "z"], dtype="string")),
            DataFrame(Series(["x", "y", "z"], dtype="category")),
            DataFrame(Series(3 * [Timestamp("2020-01-01", tz="UTC")])),
            DataFrame(Series(3 * [Interval(0, 1)])),
        ],
    )
    @pytest.mark.parametrize("errors", ["raise", "ignore"])
    def test_astype_ignores_errors_for_extension_dtypes(self, df, errors):
        # https://github.com/pandas-dev/pandas/issues/35471
        if errors == "ignore":
            expected = df
            result = df.astype(float, errors=errors)
            tm.assert_frame_equal(result, expected)
        else:
            msg = "(Cannot cast)|(could not convert)"
            with pytest.raises((ValueError, TypeError), match=msg):
                df.astype(float, errors=errors)

    def test_astype_tz_conversion(self):
        # GH 35973
        val = {
            "tz": date_range("2020-08-30",
                             freq="d",
                             periods=2,
                             tz="Europe/London")
        }
        df = DataFrame(val)
        result = df.astype({"tz": "datetime64[ns, Europe/Berlin]"})

        expected = df
        expected["tz"] = expected["tz"].dt.tz_convert("Europe/Berlin")
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("tz", ["UTC", "Europe/Berlin"])
    def test_astype_tz_object_conversion(self, tz):
        # GH 35973
        val = {
            "tz": date_range("2020-08-30",
                             freq="d",
                             periods=2,
                             tz="Europe/London")
        }
        expected = DataFrame(val)

        # convert expected to object dtype from other tz str (independently tested)
        result = expected.astype({"tz": f"datetime64[ns, {tz}]"})
        result = result.astype({"tz": "object"})

        # do real test: object dtype to a specified tz, different from construction tz.
        result = result.astype({"tz": "datetime64[ns, Europe/London]"})
        tm.assert_frame_equal(result, expected)

    def test_astype_dt64_to_string(self, frame_or_series, tz_naive_fixture,
                                   request):
        tz = tz_naive_fixture
        if tz is None:
            mark = pytest.mark.xfail(
                reason=
                "GH#36153 uses ndarray formatting instead of DTA formatting")
            request.node.add_marker(mark)

        dti = date_range("2016-01-01", periods=3, tz=tz)
        dta = dti._data
        dta[0] = NaT

        obj = frame_or_series(dta)
        result = obj.astype("string")

        # Check that Series/DataFrame.astype matches DatetimeArray.astype
        expected = frame_or_series(dta.astype("string"))
        tm.assert_equal(result, expected)

        item = result.iloc[0]
        if frame_or_series is DataFrame:
            item = item.iloc[0]
        assert item is pd.NA

        # For non-NA values, we should match what we get for non-EA str
        alt = obj.astype(str)
        assert np.all(alt.iloc[1:] == result.iloc[1:])

    def test_astype_bytes(self):
        # GH#39474
        result = DataFrame(["foo", "bar", "baz"]).astype(bytes)
        assert result.dtypes[0] == np.dtype("S3")

Пример #27

Показать файл

Файл: predict.py Проект: yoru22413/metro-traffic

def predict(model: keras.Model, standard_scaler: CustomStandardScaler,
            tf_idf: TfidfVectorizer, column_dummies, df_past, df_future):
    """model : Keras Model"""
    df_past.date_time = pd.to_datetime(df_past.date_time)
    df_future.date_time = pd.to_datetime(df_future.date_time)

    df_past.holiday = df_past.holiday != 'None'
    df_future.holiday = df_future.holiday != 'None'
    t = tf_idf.transform(df_past.weather_description)
    t2 = tf_idf.transform(df_future.weather_description)
    df_past = pd.concat([
        df_past,
        pd.DataFrame(data=t.toarray(), index=df_past.index).add_prefix('tag_')
    ],
                        axis=1)
    df_future = pd.concat([
        df_future,
        pd.DataFrame(data=t2.toarray(),
                     index=df_future.index).add_prefix('tag_')
    ],
                          axis=1)
    df_past.drop(columns='weather_description', inplace=True)
    df_future.drop(columns='weather_description', inplace=True)

    df_past['hour'] = df_past.date_time.dt.hour
    df_past['weekday'] = df_past.date_time.dt.day_name()
    df_past['day'] = df_past.date_time.dt.day
    df_past['month'] = df_past.date_time.dt.month_name()
    df_past.holiday = df_past.holiday.astype(int)

    df_future['hour'] = df_future.date_time.dt.hour
    df_future['weekday'] = df_future.date_time.dt.day_name()
    df_future['day'] = df_future.date_time.dt.day
    df_future['month'] = df_future.date_time.dt.month_name()
    df_future.holiday = df_future.holiday.astype(int)

    for col, values in column_dummies.items():
        df_future[col] = df_future[col].astype(CategoricalDtype(values))
        df_past[col] = df_past[col].astype(CategoricalDtype(values))

    df_past = df_past.join(
        pd.get_dummies(df_past.weather_main, prefix='weather'))
    df_past = df_past.join(pd.get_dummies(df_past.hour, prefix='hour'))
    df_past = df_past.join(pd.get_dummies(df_past.weekday, prefix='weekday'))
    df_past = df_past.join(pd.get_dummies(df_past.day, prefix='day'))
    df_past = df_past.join(pd.get_dummies(df_past.month, prefix='month'))

    df_future = df_future.join(
        pd.get_dummies(df_future.weather_main, prefix='weather'))
    df_future = df_future.join(pd.get_dummies(df_future.hour, prefix='hour'))
    df_future = df_future.join(
        pd.get_dummies(df_future.weekday, prefix='weekday'))
    df_future = df_future.join(pd.get_dummies(df_future.day, prefix='day'))
    df_future = df_future.join(pd.get_dummies(df_future.month, prefix='month'))

    df_past = df_past.drop(
        columns=['weather_main', 'hour', 'weekday', 'day', 'month'])
    df_future = df_future.drop(
        columns=['weather_main', 'hour', 'weekday', 'day', 'month'])

    df_past.drop(columns='date_time', inplace=True)
    df_future.drop(columns='date_time', inplace=True)

    traffic = df_past['traffic_volume'].values.reshape(-1, 1)

    df_past, df_future, traffic = standard_scaler.transform(
        [df_past, df_future, traffic])

    df_future = df_future[np.newaxis, :]
    df_past = df_past[np.newaxis, :]

    y = model.predict((df_past, df_future))
    y = tf.squeeze(y)
    y = y.numpy()
    return standard_scaler.ss[2].inverse_transform(y)