Example #1
 import pandas as pd
 from typing import Tuple

 from featuretools.selection import (remove_highly_correlated_features,
                                     remove_highly_null_features,
                                     remove_single_value_features)


 def clean_features(or_df: pd.DataFrame,
                    features_def,
                    threshold=None,
                    count_nan=False,
                    **kwds) -> Tuple[pd.DataFrame, list]:
     '''
     Clean features. To also plot them, use AutoSelect instead. Applies
     remove_highly_null_features, remove_single_value_features, and
     remove_highly_correlated_features in sequence.
     '''
     threshold = threshold or {}  # avoid a shared mutable default

     # Drop columns via the featuretools selection functions.
     or_df, features_def = remove_highly_null_features(
         or_df,
         features=features_def,
         pct_null_threshold=threshold.get('remove_null', 0.95))
     or_df, features_def = remove_single_value_features(
         or_df, features=features_def, count_nan_as_value=count_nan)
     or_df, features_def = remove_highly_correlated_features(
         or_df,
         features=features_def,
         pct_corr_threshold=threshold.get('remove_corr', 0.95),
         **kwds)
     return or_df, features_def
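
A minimal usage sketch for the helper above, not part of the original example. It assumes the pre-1.0 featuretools API used elsewhere on this page; the entityset es and the entity name 'customers' are placeholders.

 import featuretools as ft

 # Hypothetical DFS run producing a feature matrix plus matching definitions.
 feature_matrix, feature_defs = ft.dfs(entityset=es, target_entity='customers')

 # Prune highly-null, constant, and highly-correlated columns.
 clean_df, clean_defs = clean_features(feature_matrix,
                                       feature_defs,
                                       threshold={'remove_null': 0.9,
                                                  'remove_corr': 0.9})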
Example #2
    def run_dfs(self,
                max_depth=1,
                features_only=True,
                ignore_variables=None,
                reduce_mem=False,
                reduce_feats=True,
                trans_primitives=None,
                agg_primitives=None,
                chunk_size=None,
                n_jobs=1,
                **kwargs):
        """Deep Feature Synthesisf
        agg_primitives (list[str or AggregationPrimitive], optional): List of Aggregation
            Feature types to apply.

                Default: ["sum", "std", "max", "skew", "min", "mean", "count", "percent_true", "num_unique", "mode"]
                DateTime: ['time_since_last', 'time_since_first', 'trend']

        trans_primitives (list[str or TransformPrimitive], optional):
            List of Transform Feature functions to apply.

                Default: ["day", "year", "month", "weekday", "haversine", "num_words", "num_characters"]

        groupby_trans_primitives (list[str or :class:`.primitives.TransformPrimitive`], optional):
            list of Transform primitives to make GroupByTransformFeatures with

        """
        if ignore_variables is None:
            # ignore_variables = [self.target_entity_id, self.index]
            # ignore_variables = ["__id"]  # ignoring the single-value id loses some count features
            ignore_variables = []

        if trans_primitives is None:
            trans_primitives = [
                "year",
                "month",
                "day",
                "hour",
                "minute",
                "week",
                "weekday",
                "is_weekend",
                'time_since_previous',
                # diff # https://stackoverflow.com/questions/60324672/how-is-time-since-previous-computed-in-featuretools
                Quarter(),
            ]

        res = ft.dfs(
            entityset=self.es,
            # The target entity must have a unique id: either the base
            # entityset with non-duplicated ids, or one generated by
            # normalize_entity.
            target_entity=self.target_entity_id,
            features_only=features_only,
            max_depth=max_depth,
            ignore_variables={self.entity_id: ignore_variables},
            chunk_size=chunk_size,
            n_jobs=n_jobs,
            verbose=1,
            agg_primitives=agg_primitives,
            trans_primitives=trans_primitives,
            **kwargs)

        if features_only:
            return res
        else:
            df_ = res[0].add_prefix(f'{self.entity_id}_').reset_index()

            if reduce_feats:
                cprint("remove_low_information_features")
                df_ = remove_low_information_features(df_)

                cprint("remove_single_value_features")
                df_ = remove_single_value_features(df_,
                                                   count_nan_as_value=True)

                cprint("remove_duplicate_features")
                dups = duplicate_columns(df_)
                df_ = df_.drop(columns=dups)

            if reduce_mem:
                df_ = reduce_mem_usage(df_)

            return df_
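
A hedged sketch of how run_dfs might be invoked. The surrounding class is not shown in the example, so the wrapper name FeatureBuilder and its constructor are hypothetical; only the two features_only modes are illustrated.

 # Hypothetical wrapper that builds self.es and the entity ids internally.
 builder = FeatureBuilder(...)

 # Feature definitions only; nothing is computed yet.
 feature_defs = builder.run_dfs(max_depth=2, features_only=True)

 # Full feature matrix, pruned and memory-reduced.
 df = builder.run_dfs(max_depth=2, features_only=False,
                      reduce_feats=True, reduce_mem=True)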
Example #3
 def _reduce_feats(self, df):
     # Prune uninformative columns: low-information, single-value, duplicates.
     df = remove_low_information_features(df)
     df = remove_single_value_features(df, count_nan_as_value=True)
     df.drop(columns=duplicate_columns(df), inplace=True)
     return df
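
The same three-step pruning as a standalone sketch, assuming the featuretools selection API; duplicate_columns is a project-specific helper that is not shown, so a simple pandas stand-in replaces it here.

 import featuretools.selection as fts


 def reduce_feats(df):
     """Drop low-information, single-value, and duplicated columns."""
     df = fts.remove_low_information_features(df)
     df = fts.remove_single_value_features(df, count_nan_as_value=True)
     # Stand-in for the duplicate_columns helper: keep only the first of
     # any set of identical columns.
     return df.loc[:, ~df.T.duplicated()]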
Example #4
    def prosperity(countries=None):
        # Default comparison set; avoid a mutable default argument.
        if countries is None:
            countries = ['Chad', 'Togo', 'Zimbabwe', 'Ivory Coast', 'Georgia']

        url = 'https://raw.githubusercontent.com/Andrewl7127/UCSD-DataHacks-2021/main/Data/'
        df = pd.read_csv(url + 'merged.csv')
        df = df.drop(['Unnamed: 0'], axis=1)

        metrics = [
            'educ', 'soci', 'heal', 'pers', 'busi', 'econ', 'safe', 'gove',
            'envi'
        ]
        ranks = ['rank_' + metric for metric in metrics]
        drop = metrics + ranks + ['year', 'prosperity_score']

        y = df['prosperity_score']

        df = df.drop(drop, axis=1)

        df = remove_low_information_features(df)

        df = remove_highly_null_features(df)

        df = remove_single_value_features(df)

        df = remove_highly_correlated_features(df)

        X = df

        problem_type = 'regression'
        objective = 'auto'

        automl = evalml.automl.AutoMLSearch(problem_type=problem_type,
                                            objective=objective)

        #automl.search(X,y)
        #best_pipeline = automl.best_pipeline
        #best_pipeline.fit(X,y)
        #best_pipeline.save('prosperity_best_pipeline')

        best_pipeline = automl.load('prosperity_best_pipeline')

        test = pd.read_csv(url + 'test.csv', index_col=0)

        drop = ['year']
        df = test.copy()
        df = df.drop(drop, axis=1)

        df = remove_low_information_features(df)

        df = remove_highly_null_features(df)

        df = remove_single_value_features(df)

        df = remove_highly_correlated_features(df)

        X = df

        predictions = best_pipeline.predict(X)

        result = pd.DataFrame()

        result['prosperity'] = predictions

        df = pd.read_csv(url + 'test.csv')
        temp = df[['country', 'year']]

        # Predictions align with the test rows by position, so merge on index.
        result = pd.merge(left=temp,
                          right=result,
                          how="left",
                          left_index=True,
                          right_index=True)

        result['rank_prosperity'] = result.groupby("year")["prosperity"].rank(
            "dense", ascending=False)
        result['rank_prosperity'] = result['rank_prosperity'].astype('int')

        result = result[result['country'].isin(countries)]

        metric = pd.read_csv(
            'https://raw.githubusercontent.com/Andrewl7127/UCSD-DataHacks-2021/main/Metrics/prosperity_metrics.csv'
        )

        return result, metric
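
A hedged usage sketch for prosperity. Note a caveat in the function itself: the remove_* selection steps run independently on the training and test frames, so the two can end up with different column sets; reusing the training-time column list on the test set is the safer pattern.

 # Rank a custom set of countries (names must match the dataset's spelling).
 result, metric = prosperity(countries=['Chad', 'Togo'])
 print(result.head())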
Example #5
    def pillar(name='busi', countries=None):
        # Avoid a mutable default argument.
        if countries is None:
            countries = ['Chad']
        url = 'https://raw.githubusercontent.com/Andrewl7127/UCSD-DataHacks-2021/main/Data/'
        df = pd.read_csv(url + name + '_train.csv')
        df = df.drop(['Unnamed: 0'], axis=1)
        for i in df.columns:
            if 'year' in i:
                df = df.drop([i], axis=1)
        y = df[name]

        df = df.drop(['rank_' + name, name], axis=1)

        df = remove_low_information_features(df)

        df = remove_highly_null_features(df)

        df = remove_single_value_features(df)

        df = remove_highly_correlated_features(df)

        X = df
        problem_type = 'regression'
        objective = 'auto'

        automl = evalml.automl.AutoMLSearch(problem_type=problem_type,
                                            objective=objective)

        best_pipeline = automl.load(name + '_best_pipeline')

        df = pd.read_csv(url + name + '_test.csv')
        df = df.drop(['Unnamed: 0'], axis=1)

        for i in df.columns:
            if 'year' in i:
                df = df.drop([i], axis=1)

        df = remove_low_information_features(df)

        df = remove_highly_null_features(df)

        df = remove_single_value_features(df)

        df = remove_highly_correlated_features(df)

        predictions = best_pipeline.predict(df)

        result = pd.DataFrame()

        result[name] = predictions

        df = pd.read_csv(url + name + '_test.csv')
        temp = df[['country', 'year']]

        # Predictions align with the test rows by position, so merge on index.
        result = pd.merge(left=temp,
                          right=result,
                          how="left",
                          left_index=True,
                          right_index=True)

        result['rank_' + name] = result.groupby("year")[name].rank(
            "dense", ascending=False)
        result['rank_' + name] = result['rank_' + name].astype('int')

        result = result[result['country'].isin(countries)]
        metric = pd.read_csv(
            'https://raw.githubusercontent.com/Andrewl7127/UCSD-DataHacks-2021/main/Metrics/'
            + name + '_metrics.csv')

        return result, metric
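
The same call pattern works per pillar; 'busi' (presumably the business-environment pillar) is one of the metric codes listed in Example #4.

 # Predict and rank one pillar for two countries.
 result, metric = pillar(name='busi', countries=['Chad', 'Togo'])
 print(result.head())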
Example #6
 def check_single_value(or_df, count_nan=False):
     # Return the columns that remove_single_value_features would drop.
     use_cols = remove_single_value_features(
         or_df, count_nan_as_value=count_nan).columns.tolist()
     return list(set(or_df.columns.tolist()) - set(use_cols))
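
A quick check of the helper on a toy frame, assuming import pandas as pd and the remove_single_value_features import from Example #1. The returned order may vary because the result passes through a set.

 df = pd.DataFrame({'a': [1, 2, 3],
                    'b': [7, 7, 7],          # constant column
                    'c': [1, None, None]})   # a single non-NaN value
 print(check_single_value(df))  # e.g. ['b', 'c']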