Example #1
    def inverse_transform(self, y, **transform_params):
        notify.entering(__class__.__name__, "inverse_transform")
        # Reverse the log transform applied to the target
        d = {"Sale_Price": np.exp(y["Sale_Price"].values)}
        df = pd.DataFrame(data=d)
        notify.leaving(__class__.__name__, "inverse_transform")
        return df
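For context, this inverse_transform undoes a log transform applied to the target before fitting. A minimal round-trip sketch; the variable names below are illustrative, not from the source:

import numpy as np
import pandas as pd

# Forward: log-transform the target; inverse: map predictions back with exp
y = pd.DataFrame({"Sale_Price": [100000.0, 250000.0]})
y_log = pd.DataFrame({"Sale_Price": np.log(y["Sale_Price"].values)})
y_back = pd.DataFrame({"Sale_Price": np.exp(y_log["Sale_Price"].values)})
assert np.allclose(y["Sale_Price"], y_back["Sale_Price"])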
Example #2
    def fit(self, X, y=None, **fit_params):
        notify.entering(__class__.__name__, "fit")
        # Retain the original column names so encoded features can be mapped back
        self.original_features_ = X.columns.tolist()
        self.ohe_ = OneHotEncoder(handle_unknown="ignore")
        self.ohe_.fit(X)
        notify.leaving(__class__.__name__, "fit")
        return self
Example #3
    def _transform(self, X, y=None):
        notify.entering(__class__.__name__, "transform")
        # Correct a data entry error: a garage year built of 2207 should be 2007
        X["Garage_Yr_Blt"] = X["Garage_Yr_Blt"].replace(to_replace=2207, value=2007)

        # Drop the location coordinates and impute remaining numeric gaps with the median
        X = X.drop(columns=["Latitude", "Longitude"])
        X = X.fillna(X.median(numeric_only=True))
        notify.leaving(__class__.__name__, "transform")
        return X
Example #4
    def _transform(self, X, y=None, **transform_params):
        notify.entering(__class__.__name__, "transform")
        # Encode the nominal features using the configured encoder
        self._encoder.fit(X, y)
        X = self._encoder.transform(X, y)

        # Scale the features and standardize to zero mean unit variance
        scaler = StandardScaler()
        X[self._nominal] = scaler.fit_transform(X[self._nominal])
        notify.leaving(__class__.__name__, "transform")
        return X
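Since self._encoder is fitted with y and its transform also accepts y, it behaves like a target-style encoder. A hedged sketch of that pattern using category_encoders, which is an assumption about the actual dependency behind self._encoder:

import pandas as pd
from category_encoders import TargetEncoder  # assumed; the snippet's encoder is not shown

X = pd.DataFrame({"Neighborhood": ["Gilbert", "NridgHt", "Gilbert"]})
y = pd.Series([180000, 350000, 190000])
enc = TargetEncoder(cols=["Neighborhood"])
Xt = enc.fit(X, y).transform(X)  # nominal levels replaced by target-derived values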
Example #5
    def transform(self, X, **transform_params):
        notify.entering(__class__.__name__, "transform")
        categorical = list(X.select_dtypes(include=["object"]).columns)
        # Create imputer object
        imputer = SimpleImputer(strategy="most_frequent")

        # Perform imputation of categorical variables to most frequent
        X[categorical] = imputer.fit_transform(X[categorical])

        notify.leaving(__class__.__name__, "transform")

        return X
Example #6
    def run(self, X, y=None):
        notify.entering(__class__.__name__, "run")
        # Add an age feature derived from year built; assign the result,
        # since fillna without assignment is a no-op
        X["Age"] = X["Year_Sold"] - X["Year_Built"]
        X["Age"] = X["Age"].fillna(X["Age"].median())

        # Add an age feature for the garage; missing garages get age zero
        X["Garage_Age"] = X["Year_Sold"] - X["Garage_Yr_Blt"]
        X["Garage_Age"] = X["Garage_Age"].fillna(value=0)

        notify.leaving(__class__.__name__, "run")
        return X, y
Example #7
    def transform(self, X, **transform_params):
        notify.entering(__class__.__name__, "transform")
        # Impute missing values as a linear function of the other features
        imputer = IterativeImputer()
        X[self._continuous] = imputer.fit_transform(X[self._continuous])

        # Power transformation to make feature distributions closer to Gaussian
        power = PowerTransformer(method="yeo-johnson", standardize=False)
        X[self._continuous] = power.fit_transform(X[self._continuous])

        notify.leaving(__class__.__name__, "transform")

        return X
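Note that IterativeImputer is an experimental scikit-learn estimator; the imports this snippet relies on must enable it explicitly before it can be imported:

from sklearn.experimental import enable_iterative_imputer  # noqa: F401  (must come first)
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import PowerTransformer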
Example #8
    def transform(self, X, **transform_params):
        notify.entering(__class__.__name__, "transform")
        # Map ordinal levels to their numeric encodings
        for variable, mappings in self._ordinal_map.items():
            X[variable] = X[variable].replace(mappings)

        # Scale the ordinal features as continuous data; the original
        # referenced an undefined name `ordinal`, fixed to self._ordinal
        scaler = StandardScaler()
        X[self._ordinal] = scaler.fit_transform(X[self._ordinal])

        notify.leaving(__class__.__name__, "transform")

        return X
Example #9
    def transform(self, X, **transform_params):
        notify.entering(__class__.__name__, "transform")
        # Missing discrete variables are imputed according to the strategy
        # provided; the default strategy is the mean.
        imputer = SimpleImputer(strategy=self._strategy)
        X[self._discrete] = imputer.fit_transform(X[self._discrete])

        # Standardize discrete variables to zero mean and unit variance
        scaler = StandardScaler()
        X[self._discrete] = scaler.fit_transform(X[self._discrete])

        notify.leaving(__class__.__name__, "transform")

        return X
Example #10
    def run(self, X, y=None, **fit_params):
        notify.entering(__class__.__name__, "run")
        # Add an age feature derived from year built; assign the result,
        # since fillna without assignment is a no-op
        X["Age"] = X["Year_Sold"] - X["Year_Built"]
        X["Age"] = X["Age"].fillna(X["Age"].median())

        # Remove longitude and latitude
        X = X.drop(columns=["Latitude", "Longitude"])

        # Remove outliers; idx holds index labels, so select with .loc
        idx = X[(X["Gr_Liv_Area"] <= 4000) & (X["Garage_Yr_Blt"] <= 2010)].index.tolist()
        X = X.loc[idx]
        y = y.loc[idx]

        notify.leaving(__class__.__name__, "run")
        return X, y
Example #11
    def transform(self, X, **transform_params):
        """Convert nominal variables to a one-hot representation."""
        notify.entering(__class__.__name__, "transform")

        self.X_ = self.ohe_.transform(X).toarray()
        self.transformed_features_ = self.ohe_.get_feature_names(self.original_features_).tolist()
        self.X_df_ = pd.DataFrame(self.X_, columns=self.transformed_features_)

        # Map each one-hot column back to the original feature it came from
        self.to_original_ = {}
        for i in self.transformed_features_:
            for j in self.original_features_:
                if j in i:
                    self.to_original_[i] = j
                    break

        notify.leaving(__class__.__name__, "transform")
        return self.X_
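Together with the fit in Example #2, this implements a one-hot wrapper that keeps a column-name mapping. A runnable sketch of the underlying round trip; note that get_feature_names is the older API, and recent scikit-learn versions expose get_feature_names_out instead:

import pandas as pd
from sklearn.preprocessing import OneHotEncoder

X = pd.DataFrame({"Garage_Type": ["Attchd", "Detchd", "Attchd"]})
ohe = OneHotEncoder(handle_unknown="ignore")
arr = ohe.fit(X).transform(X).toarray()
names = ohe.get_feature_names_out(X.columns.tolist())  # e.g. "Garage_Type_Attchd"
print(pd.DataFrame(arr, columns=names))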
Example #12
    def _transform(self, X, y=None):
        notify.entering(__class__.__name__, "_transform")
        self.features_removed_ = []
        self.X_ = X

        # Repeat until no pair of numeric features exceeds the correlation threshold;
        # the original recursed into a method that reset features_removed_, wiping
        # the accumulated removals, so the recursion is rewritten as a loop
        while True:
            columns = list(set(self.X_.columns.tolist()).intersection(self._numeric))

            # Perform pairwise correlation coefficient calculations
            correlations = pd.DataFrame()
            for col_a, col_b in itertools.combinations(columns, 2):
                r, p = pearsonr(self.X_[col_a], self.X_[col_b])
                cols = col_a + "__" + col_b
                d = {"Columns": cols, "A": col_a, "B": col_b, "Correlation": r, "p-value": p}
                df = pd.DataFrame(data=d, index=[0])
                correlations = pd.concat((correlations, df), axis=0)

            # Now compute correlation between features and the target
            relevance = pd.DataFrame()
            for column in columns:
                r, p = pearsonr(self.X_.loc[:, column], y)
                d = {"Feature": column, "Correlation": r, "p-value": p}
                df = pd.DataFrame(data=d, index=[0])
                relevance = pd.concat((relevance, df), axis=0)

            # Obtain pairs above the correlation threshold and below alpha
            self.suspects_ = correlations[(correlations["Correlation"] >= self._threshold) & (correlations["p-value"] <= self._alpha)]
            if self.suspects_.shape[0] == 0:
                break

            # For each suspect pair, remove the member less correlated with the target
            to_remove = []
            for index, row in self.suspects_.iterrows():
                a = np.abs(relevance[relevance["Feature"] == row["A"]]["Correlation"].values)
                b = np.abs(relevance[relevance["Feature"] == row["B"]]["Correlation"].values)
                if a > b:
                    to_remove.append(row["B"])
                else:
                    to_remove.append(row["A"])

            # Deduplicate in case a column is implicated by more than one pair
            to_remove = list(set(to_remove))
            self.X_ = self.X_.drop(columns=to_remove)
            self.features_removed_ += to_remove

        notify.leaving(__class__.__name__, "_transform")
        return self.X_
Example #13
    def _transform(self, X, y=None, **transform_params):
        notify.entering(__class__.__name__, "transform")
        # Impute missing ordinal variables using the configured strategy
        imputer = SimpleImputer(strategy=self._strategy)
        X[self._ordinal] = imputer.fit_transform(X[self._ordinal])

        # Map levels to ordinal mappings
        for variable, mappings in self._ordinal_map.items():
            X[variable] = X[variable].replace(mappings)

        # Scale the features and standardize to zero mean unit variance
        scaler = StandardScaler()
        X[self._ordinal] = scaler.fit_transform(X[self._ordinal])

        notify.leaving(__class__.__name__, "transform")

        return X
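A sketch of the shape self._ordinal_map is assumed to take; the variable and levels below are illustrative, not taken from the source:

import pandas as pd

ordinal_map = {"Exter_Qual": {"Po": 0, "Fa": 1, "TA": 2, "Gd": 3, "Ex": 4}}
X = pd.DataFrame({"Exter_Qual": ["TA", "Gd", "Ex"]})
for variable, mappings in ordinal_map.items():
    X[variable] = X[variable].replace(mappings)  # levels become integer ranks 2, 3, 4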
Example #14
    def _transform(self, X, y=None):
        notify.entering(__class__.__name__, "_transform")
        numeric = self._continuous + self._discrete
        columns = list(set(X.columns.tolist()).intersection(numeric))

        # Measure the correlation of each numeric predictor with the response
        self.remaining_ = pd.DataFrame()
        self.features_removed_ = []
        for column in columns:
            r, p = pearsonr(X[column], y)
            if (np.abs(r) <= self._threshold) & (p <= self._alpha):
                self.features_removed_.append(column)
            else:
                d = {"Feature": column, "Correlation": r, "p-value": p}
                df = pd.DataFrame(data=d, index=[0])
                self.remaining_ = pd.concat((self.remaining_, df), axis=0)

        # Drop the weakly correlated features
        self.X_ = X.drop(columns=self.features_removed_)
        notify.leaving(__class__.__name__, "_transform")
        return self.X_
Example #15
    def _transform(self, X, y=None):
        notify.entering(__class__.__name__, "_transform")
        categorical = self._ordinal + self._nominal
        columns = list(set(X.columns.tolist()).intersection(categorical))

        # Measure variance between predictor levels w.r.t. the response
        self.remaining_ = pd.DataFrame()
        self.features_removed_ = []
        for column in columns:
            f, p = f_oneway(X[column], y)
            if p > self._alpha:
                self.features_removed_.append(column)
            else:
                d = {"Feature": column, "F-statistic": f, "p-value": p}
                df = pd.DataFrame(data=d, index=[0])
                self.remaining_ = pd.concat((self.remaining_, df), axis=0)

        # Drop features with no significant relationship to the response
        self.X_ = X.drop(columns=self.features_removed_)
        notify.leaving(__class__.__name__, "_transform")
        return self.X_
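One caveat: scipy.stats.f_oneway expects one sample per group, so passing the encoded column and the target as two samples is not a per-level ANOVA. A sketch of grouping the response by predictor level (names and values illustrative):

import pandas as pd
from scipy.stats import f_oneway

df = pd.DataFrame({"Garage_Type": ["Attchd", "Detchd", "Attchd", "Detchd"],
                   "Sale_Price": [200000, 150000, 210000, 140000]})
groups = [g["Sale_Price"].values for _, g in df.groupby("Garage_Type")]
f, p = f_oneway(*groups)  # variance between level means vs. within levels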
Example #16
    def transform(self, X, **transform_params):
        notify.entering(__class__.__name__, "transform")
        X = self._enc.transform(X)
        notify.leaving(__class__.__name__, "transform")

        return X
Example #17
    def _fit(self, X, y=None, **fit_params):
        notify.entering(__class__.__name__, "fit")
        # Stateless step: no fitting required
        notify.leaving(__class__.__name__, "fit")
        return self