Example #1
    def fit(self, X: dd, y=None):
        """Calculate what columns should be removed, based on the defined thresholds

        Args:
            X (dd): Dataframe to be processed
            y (dd, optional): Target. Defaults to None.

        Returns:
            Filter_Entropy: The fitted transformer (self)
        """
        subset = X.select_dtypes(exclude=[np.number, "datetime64[ns]"])

        # Calculate the entropy column-wise
        entropies_df = subset.compute().apply(entropy,
                                              axis=0).to_frame(name="entropy")
        entropies_df.reset_index(inplace=True)
        entropies_df.rename(columns={"index": "column_name"}, inplace=True)
        entropies_df.sort_values(by="entropy", inplace=True, ascending=False)

        # Get thresholds and calculate what columns will be removed
        thresholds = [float(value) for value in self.entropy_thresholds]
        mask_entropy = entropies_df["entropy"].between(
            min(thresholds), max(thresholds), inclusive=self.inclusive)

        # Get list of columns to be removed
        self.feature_names = list(entropies_df.loc[~mask_entropy,
                                                   "column_name"].values)
        mask_removed = entropies_df["column_name"].isin(self.feature_names)
        entropies_df.loc[mask_removed, "filtered_entropy"] = 1

        return self
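
A minimal sketch of the interval logic used above, with invented column names, entropy values, and thresholds; note that recent pandas expects `inclusive` to be a string such as "both" or "neither" rather than the bool used in the factories below:

import pandas as pd

# Invented per-column entropies, standing in for the apply(entropy, axis=0) result
entropies_df = pd.DataFrame({
    "column_name": ["city", "country", "user_id"],
    "entropy": [0.4, 1.2, 5.3],
})

thresholds = [0.5, 2.0]  # assumed entropy_thresholds
# Columns whose entropy lies inside [min, max] are kept
mask_entropy = entropies_df["entropy"].between(
    min(thresholds), max(thresholds), inclusive="neither")

# Columns outside the interval are the ones fit() records in self.feature_names
removed = list(entropies_df.loc[~mask_entropy, "column_name"].values)
print(removed)  # ['city', 'user_id']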
Example #2
def make_filter_std_pipeline(data: dd,
                             numerical_columns: list[str] | bool = True,
                             thresholds: list[float] | None = None,
                             inclusive: bool = False):
    # TODO: write unit tests
    """
    Makes pipeline to filter columns according to standard deviation

    Args:
        data (dd): Data frame to be filtered
        numerical_columns (list or bool, optional): Columns to subset the filtering. Defaults to True.
        thresholds (list, optional): Interval of std values to filter. Defaults to None.
        inclusive (bool, optional):  Includes or not the interval boundaries. Defaults to False.

    Returns:
        EPipeline: Pipeline to filter data frame
    """
    # If a bool is passed, fall back to every numerical column in the frame
    selected_columns = data.select_dtypes(
        include=[np.number]).columns.values if isinstance(
            numerical_columns, bool) else numerical_columns
    steps = [("extract", Extract(selected_columns)),
             ("std_filter",
              Filter_Std(std_thresholds=thresholds, inclusive=inclusive))]

    return EPipeline(steps)
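
A small sketch, with invented data, of how the default numerical_columns=True resolves to the frame's numeric columns (an explicit list would be passed through unchanged):

import dask.dataframe as dd
import numpy as np
import pandas as pd

pdf = pd.DataFrame({"age": [1, 2, 3], "name": ["a", "b", "c"], "score": [0.1, 0.2, 0.3]})
data = dd.from_pandas(pdf, npartitions=1)

# This mirrors the isinstance(numerical_columns, bool) branch above
print(data.select_dtypes(include=[np.number]).columns.values)  # ['age' 'score']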
Example #3
def make_filter_entropy_pipeline(data: dd,
                                 categorical_columns: list[str] | bool = True,
                                 thresholds: list[float] | None = None,
                                 inclusive: bool = False):
    # TODO: write unit tests
    """
    Makes a pipeline to filter columns according to their entropy

    Args:
        data (dd): Data frame to be filtered
        categorical_columns (list or bool, optional): Columns to subset the filtering; True selects all object columns. Defaults to True.
        thresholds (list, optional): Interval of entropy values to keep. Defaults to None.
        inclusive (bool, optional): Whether the interval boundaries are included. Defaults to False.

    Returns:
        EPipeline: Pipeline to filter the data frame
    """
    # If a bool is passed, fall back to every categorical (object) column
    selected_columns = data.select_dtypes(
        exclude=[np.number], include=["object"]).columns.values if isinstance(
            categorical_columns, bool) else categorical_columns
    steps = [("extract", Extract(selected_columns)),
             ("entropy_filter",
              Filter_Entropy(entropy_thresholds=thresholds,
                             inclusive=inclusive))]

    return EPipeline(steps)
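
The categorical counterpart resolves its default the same way, keeping only object-typed columns (again with invented data); the .columns.values access matters here because Extract presumably expects column names rather than a frame:

import dask.dataframe as dd
import numpy as np
import pandas as pd

pdf = pd.DataFrame({"age": [1, 2, 3], "city": ["a", "b", "c"], "country": ["x", "y", "z"]})
data = dd.from_pandas(pdf, npartitions=1)

# This mirrors the isinstance(categorical_columns, bool) branch above
print(data.select_dtypes(exclude=[np.number], include=["object"]).columns.values)
# ['city' 'country']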
Example #4
    def fit(self, X: dd, y=None):
        """Calculate what columns should be removed, based on the defined thresholds

        Args:
            X (dd): Dataframe to be processed
            y (dd, optional): Target. Defaults to None.

        Returns:
            Filter_Std: The fitted transformer (self)
        """
        subset = X.select_dtypes(include=[np.number])

        # Calculate the standard deviation column-wise
        stds = np.nanstd(subset, axis=0)

        stds_df = pd.DataFrame.from_dict({
            "column_name": subset.columns.values,
            "std": stds
        })

        stds_df.sort_values(by="std", inplace=True, ascending=False)

        # Get thresholds and calculate what columns will be removed
        thresholds = [float(value) for value in self.std_thresholds]
        mask_variance = stds_df["std"].between(min(thresholds),
                                               max(thresholds),
                                               inclusive=self.inclusive)

        # Get list of columns to be removed
        self.feature_names = list(stds_df.loc[~mask_variance,
                                              "column_name"].values)
        mask_removed = stds_df["column_name"].isin(self.feature_names)

        stds_df.loc[mask_removed, "filtered_variance"] = 1
        stds_df.loc[~mask_removed, "filtered_variance"] = 0

        return self
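
To make the standard-deviation thresholding concrete, a self-contained sketch with invented data and thresholds (again using the string form of `inclusive` that recent pandas expects):

import numpy as np
import pandas as pd

subset = pd.DataFrame({
    "constant": [1.0, 1.0, 1.0],   # std = 0.0
    "noisy": [0.0, 10.0, 20.0],    # std ~ 8.16
    "mild": [1.0, 2.0, 3.0],       # std ~ 0.82
})

# Column-wise standard deviation, ignoring NaNs, as in fit()
stds = np.nanstd(subset, axis=0)
stds_df = pd.DataFrame.from_dict({
    "column_name": subset.columns.values,
    "std": stds,
})

thresholds = [0.5, 5.0]  # assumed std_thresholds
mask_variance = stds_df["std"].between(
    min(thresholds), max(thresholds), inclusive="both")

# Columns whose std falls outside the interval are the ones fit() marks for removal
feature_names = list(stds_df.loc[~mask_variance, "column_name"].values)
print(feature_names)  # ['constant', 'noisy']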