def predict():
    """Handle the prediction form and render the predicted value.

    On POST, reads ``OverallQuality``, ``LivingArea`` and ``GarageCars`` from
    the submitted form, applies ``boxcox1p(value, 0.15)`` to each, feeds the
    three values as one row to the model unpickled from ``lgb_model.pkl`` and
    renders ``predict.html`` with ``expm1`` of the model output rounded to two
    decimals.

    A value that is not a number or lies outside its accepted range (and any
    other ``ValueError`` raised while predicting) yields the plain-text range
    message instead of a rendered page. For any other request method the
    template is rendered with ``prediction=None``.
    """
    # (form field, lowest accepted value, highest accepted value)
    feature_ranges = (
        ('OverallQuality', 1, 10),
        ('LivingArea', 400, 5600),
        ('GarageCars', 0, 4),
    )
    lam = 0.15

    # Defined up front so a non-POST request renders the empty form instead of
    # failing on an unbound local.
    model_prediction = None

    if request.method == 'POST':
        try:
            pred_args = []
            for field, lowest, highest in feature_ranges:
                value = float(request.form[field])
                # Reject explicitly (NaN fails this check too) rather than
                # passing a placeholder on to the model.
                if not lowest <= value <= highest:
                    raise ValueError(f"{field} is out of range")
                pred_args.append(boxcox1p(value, lam))

            # NOTE(review): unpickling executes code stored in the file; the
            # model file must come from a trusted source.
            with open('lgb_model.pkl', 'rb') as f:
                ml_model = pickle.load(f)

            pred_args_arr = np.array(pred_args).reshape(1, -1)
            model_prediction = np.expm1(ml_model.predict(pred_args_arr))
            # One input row is passed in, so the first element is the
            # prediction; indexing avoids float() on a non-scalar array.
            model_prediction = round(float(np.ravel(model_prediction)[0]), 2)
        except ValueError:
            return "Please check if the values are in the range!"

    return render_template('predict.html', prediction=model_prediction)
def adjust_skewness(df: pd.DataFrame, specific: str = None) -> pd.DataFrame:
    """
    Adjusts the skewness of columns by finding highly skewed columns and
    performing a boxcox1p transformation (lambda 0.15) on them.

    A column counts as highly skewed when the absolute skew of its non-null
    values exceeds 0.7. Columns whose dtype name is ``object`` or ``category``
    are never inspected. A column is left untouched when its transformed
    values would contain nulls.

    :param df: pandas DataFrame to adjust skewed columns in (modified in place)
    :param specific: when given, transform only this column, regardless of its
        skew
    :return: the same pandas DataFrame with skew adjusted columns
    """
    lam = 0.15
    threshold = 0.7

    if specific:
        skewed_features = [specific]
    else:
        numerics = [
            column for column, dtype in zip(df.columns, df.dtypes)
            if dtype.name not in ('object', 'category')
        ]
        skew_by_column = df[numerics].apply(lambda x: skew(x.dropna()))
        # Filter the Series itself: masking a one-column DataFrame only blanks
        # the non-matching rows and keeps them in the index, which would mark
        # every numeric column as skewed.
        skewed_features = skew_by_column[skew_by_column.abs() > threshold].index

    for feat in skewed_features:
        transformed = boxcox1p(df[feat], lam)
        if not transformed.isnull().any():
            df[feat] = transformed
    return df
def _transform_skewed_features(self, numerical_vars: np.ndarray) -> None:
    """
    Private method for transforming features with high skew.

    Logs the skew of every column in ``numerical_vars`` (computed on
    ``self.X``), applies ``boxcox1p`` with the module-level
    ``box_cox_lambda`` to each column whose absolute skew exceeds 0.75 in
    both ``self.X`` and ``self.X_test``, then logs the skew again.

    :param numerical_vars: Set of all originally numerical variables.
    """
    def report_skewness():
        # Skew of the non-null values per column, largest first; logged as a
        # one-column table.
        per_column = self.X[numerical_vars].apply(
            lambda x: skew(x.dropna())).sort_values(ascending=False)
        logging.info("Skew in numerical features: \n")
        table = pd.DataFrame({'Skew': per_column})
        logging.info(table)
        return table

    logging.info(
        f'#{self._index()} - Determine and transform skewed features...')

    before = report_skewness()

    # The test set follows the selection made on the training set.
    to_transform = before.loc[before['Skew'].abs() > 0.75].index
    logging.info(f"There are {to_transform.size} skewed features")
    for name in to_transform:
        self.X[name] = boxcox1p(self.X[name], box_cox_lambda)
        self.X_test[name] = boxcox1p(self.X_test[name], box_cox_lambda)

    # Log the skew once more so the effect of the transform is visible.
    report_skewness()

    logging.info(f'#{self._step_index} - DONE!')
def transform_numerical_features(df_train, df_test):
    """
    Log-transform the target and Box-Cox transform positively skewed features.

    ``SalePrice`` in ``df_train`` is replaced by ``log1p(SalePrice)``. Every
    other non-object column of ``df_train`` whose skewness exceeds 0.5 is
    transformed with ``boxcox1p``; the lambda is fitted on the training
    column (shifted by 1) and the same lambda is applied to ``df_test``.

    Both frames are modified in place; nothing is returned.

    TODO currently deals with positively skewed features only; features with
    skewness below -0.5 are left untouched. Analyse this.

    :param df_train: training DataFrame, must contain ``SalePrice``
    :param df_test: test DataFrame with the same feature columns
    :return: None
    """
    # SalePrice. Check log against log(1+x), log1p
    df_train['SalePrice'] = np.log1p(df_train['SalePrice'])

    # Skewness of every non-object feature. The builtin float replaces the
    # np.float alias, which no longer exists in current NumPy releases.
    features_skewness = [
        [k, float(stats.skew(df_train[k]))]
        for k in df_train.columns
        if k != 'SalePrice' and df_train.dtypes[k] != object
    ]
    features_skewness_df = pd.DataFrame(
        features_skewness, columns=['F', 'S']).sort_values(by='S')
    positively_skewed = features_skewness_df[
        features_skewness_df['S'] > 0.5]['F']

    # boxcox1p(x, lmbda):
    #   y = log(1+x)                   if lmbda == 0
    #   y = ((1+x)**lmbda - 1) / lmbda if lmbda != 0
    # boxcox_normmax only accepts positive data, hence the + 1 shift that
    # mirrors what boxcox1p applies internally.
    for f in positively_skewed:
        box_cox_coef = stats.boxcox_normmax(df_train[f] + 1)
        df_train[f] = special.boxcox1p(df_train[f], box_cox_coef)
        df_test[f] = special.boxcox1p(df_test[f], box_cox_coef)
def hello():
    """Predict a price from the submitted form and render ``result.html``.

    Steps, in order:

    1. Read the listing fields from ``request.form``.
    2. Transform street, condition and pool with the encoder unpickled from
       ``Encoder.pkl``.
    3. Apply ``boxcox1p(value, 0.15) + 1`` to area, story, condition, quality
       and year.
    4. Assemble one row, run the columns at positions 1-5 through the
       per-column encoders loaded from ``labelencoder_dict.joblib`` and
       ``onehotencoder_dict.joblib``, and pass the other columns through.
    5. Render the template with ``expm1`` of the model's prediction for that
       row.
    """
    story = str(request.form['story'])
    area = request.form['area']
    street = request.form['street']
    # Not fed to the model; still read like the other fields so a missing
    # 'utilities' entry is reported the same way as before.
    utilities = request.form['utilities']
    neighbor = request.form['neighbor']
    bldgtype = request.form['bldgtype']
    housestyle = request.form['housestyle']
    quality = request.form['quality']
    condition = str(request.form['condition'])
    year = request.form['year']
    foundation = request.form['foundation']
    garage = request.form['garage']
    pool = request.form['pool']

    # NOTE(review): pickle/joblib loading executes code stored in the file;
    # these artifacts must come from a trusted source.
    # The context manager closes the file even when unpickling fails.
    with open('Encoder.pkl', 'rb') as pkl_file:
        lbl = pickle.load(pkl_file)

    test = [street, condition, pool, street]
    print(test)
    x = lbl.transform(test)
    condition = x[1]
    pool = x[2]
    street = x[3]  # same input as x[0]; the last entry is the one kept

    lam = 0.15
    area = boxcox1p(float(area), lam) + 1
    story = boxcox1p(float(story), lam) + 1
    condition = boxcox1p(float(condition), lam) + 1
    quality = boxcox1p(float(quality), lam) + 1
    year = boxcox1p(float(year), lam) + 1

    test_data = np.asarray([[area, bldgtype, foundation, garage, housestyle,
                             neighbor, story, condition, quality, pool,
                             street, year]])

    labelencoder_dict = joblib.load('labelencoder_dict.joblib')
    onehotencoder_dict = joblib.load('onehotencoder_dict.joblib')
    model = joblib.load('xgboost_model.joblib')

    # Positions in test_data holding bldgtype, foundation, garage, housestyle
    # and neighbor: these go through the stored encoders.
    categorical_positions = (1, 2, 3, 4, 5)
    n_rows = test_data.shape[0]

    encoded_data = None
    for i in range(test_data.shape[1]):
        if i in categorical_positions:
            feature = labelencoder_dict[i].transform(test_data[:, i])
            feature = feature.reshape(n_rows, 1)
            feature = onehotencoder_dict[i].transform(feature)
        else:
            feature = test_data[:, i].reshape(n_rows, 1)

        if encoded_data is None:
            encoded_data = feature
        else:
            encoded_data = np.concatenate((encoded_data, feature), axis=1)

    price = np.expm1(model.predict(encoded_data))
    print(price[0])
    return render_template("result.html", result=price[0])
def test_boxcox1p_nonfinite():
    """boxcox1p yields nan for x < -1 and -inf at x == -1 with lambda <= 0.

    Written with direct assertions: as a generator ("yield") test the checks
    only run under runners that still iterate yielded tests.
    """
    # x < -1 => y = nan
    x = np.array([-2, -2, -1.5])
    y = boxcox1p(x, [0.5, 2.0, -1.5])
    assert_equal(y, np.array([np.nan, np.nan, np.nan]))

    # x = -1 and lambda <= 0 => y = -inf
    x = -1
    y = boxcox1p(x, [-2.5, 0])
    assert_equal(y, np.array([-np.inf, -np.inf]))
def transform_continuous_by_boxcox(df, columns_to_treat=None, lambda_value=0.15, skewness_threshold=0.75):
    """Apply ``boxcox1p`` in place to highly skewed (or explicitly named) columns.

    :param df: DataFrame to modify in place.
    :param columns_to_treat: columns to transform. When ``None`` or empty, the
        columns are picked from ``statil.analyse_skew(df)``: every entry whose
        absolute ``'Skew'`` value exceeds ``skewness_threshold``.
    :param lambda_value: Box-Cox lambda passed to ``boxcox1p``.
    :param skewness_threshold: absolute skew above which a column is
        transformed during automatic selection.
    :return: None
    """
    from scipy.special import boxcox1p

    if columns_to_treat is None or len(columns_to_treat) == 0:
        # Imported only on this path so callers that name their columns do
        # not need the skew-analysis helper.
        from KUtils.stat import statil

        # Find columns yourself
        skew_df = statil.analyse_skew(df)
        skew_df = skew_df.loc[abs(skew_df['Skew']) > skewness_threshold]
        for a_col in list(skew_df.index):
            print('Transforming {0} using boxcox1p'.format(a_col))
            df[a_col] = boxcox1p(df[a_col], lambda_value)
    else:
        for a_col in columns_to_treat:
            df[a_col] = boxcox1p(df[a_col], lambda_value)
def numeric_transform(self, df):
    """Box-Cox transform (lambda 0.15) the highly skewed numeric columns of ``df``.

    Candidate columns are those that are not ``self.coly``, not ``'train'``
    and not of object dtype. A candidate is transformed in place with
    ``boxcox1p`` when the absolute skew of its non-null values exceeds 0.75.

    :param df: DataFrame to modify in place.
    :return: the same DataFrame.
    """
    numeric_feats = [
        col for col in df.columns
        if col != self.coly and df[col].dtypes != 'O' and col != 'train'
    ]

    # Check the skew of all numerical features
    skewed_feats = df[numeric_feats].apply(
        lambda x: skew(x.dropna())).sort_values(ascending=False)
    print("\nSkew in numerical features: \n")

    # Filter the Series directly: a boolean mask applied to a one-column
    # DataFrame keeps every row (non-matching ones become NaN), which made
    # all numeric features look skewed.
    skewness = skewed_feats[skewed_feats.abs() > 0.75]
    print("There are {} skewed numerical features to Box Cox transform".
          format(skewness.shape[0]))

    lam = 0.15
    for feat in skewness.index:
        df[feat] = boxcox1p(df[feat], lam)
    return df
def dampenSkew(df, num_features):
    """Box-Cox transform, in place, the strongly skewed columns of ``df``.

    The skew of every column in ``num_features`` is measured; each column
    whose absolute skew exceeds 1.0 is replaced by ``boxcox1p`` of itself,
    with the lambda fitted by ``boxcox_normmax`` on the column shifted by 1.

    Returns the same DataFrame.
    """
    def column_skew(column):
        return skew(column)

    skew_by_column = df[num_features].apply(column_skew)
    skew_by_column = skew_by_column.sort_values(ascending=False)
    is_heavy = abs(skew_by_column) > 1.0

    for name in skew_by_column[is_heavy].index:
        # boxcox_normmax needs positive data; the shift matches the 1 + x
        # that boxcox1p applies itself.
        fitted_lambda = boxcox_normmax(df[name] + 1)
        df[name] = boxcox1p(df[name], fitted_lambda)
    return df
def transform(self, X, y=None):
    """Apply the configured skew corrections to the columns present in ``X``.

    Three feature groups are read from the instance and intersected with the
    columns of ``X``; they are processed in this order:

    * ``self.numerical_features`` -> ``boxcox1p(column, 0.70)``
    * ``self.further_skewed``     -> ``log1p(column)``
    * ``self.special_skewed``     -> ``column ** 2``

    ``X`` is modified in place and returned; ``y`` is ignored.
    """
    import numpy as np
    from scipy.special import boxcox1p

    present = set(X.columns.to_list())

    def selected(group):
        # Only columns that survived earlier feature selection are touched.
        return list(set(group) & present)

    for name in selected(self.numerical_features):
        X[name] = boxcox1p(X[name], 0.70)

    for name in selected(self.further_skewed):
        X[name] = np.log1p(X[name])

    for name in selected(self.special_skewed):
        X[name] = (X[name]) ** 2

    return X
def __autoTransform(self, df, col):
    """Try several transforms on ``df[col]`` and possibly return one of them.

    Candidates are ``log1p`` and ``sqrt``; when every value in the column is
    strictly positive, ``boxcox`` (fitted lambda) and ``boxcox1p`` with
    lambda 0.15 are added. The candidate selected is the one at
    ``np.argmax`` of the candidates' skewness. If that skewness is lower than
    ``self.skew_transform_[col]['after']``, the record's ``'after'`` and
    ``'method'`` entries are updated and the transformed column is returned;
    otherwise the original column is returned unchanged.

    NOTE(review): argmax selects the candidate with the largest skewness,
    not the one closest to zero - confirm this is the intended choice.
    """
    column = df[col]

    # (method name, transformed values), in the order the methods are tried.
    candidates = [
        ('log1p', np.log1p(column)),
        ('sqrt', np.sqrt(column)),
    ]
    # boxcox can work on positive value only, so both Box-Cox variants are
    # offered only for strictly positive columns.
    if column.min() > 0:
        candidates.append(('boxcox', boxcox(column)[0]))
        candidates.append(('boxcox1p', boxcox1p(column, 0.15)))

    skewness = [skew(values) for _, values in candidates]
    chosen = np.argmax(skewness)
    method_name, transformed = candidates[chosen]

    record = self.skew_transform_[col]
    if skewness[chosen] < record['after']:
        record['after'] = skewness[chosen]
        record['method'] = method_name
        return transformed
    return column
def trans_skewed(X, thresh=0.75, lmbda = 0):
    '''
    Power or log transformation of the skewed features of X.

        y = ((1+x)**lmbda - 1) / lmbda   if lmbda != 0
            log(1+x)                     if lmbda == 0

    The skewness comes from ``find_skewness(X)``; every feature whose
    absolute skewness exceeds ``thresh`` is transformed in place with
    ``boxcox1p``. The elapsed time is appended to the project's
    "_log_feature_transformation.csv" file and printed.

    parameter:
        X : feature matrix (modified in place and returned)
        thresh: threshold for skewness
        lmbda: Box-Cox lambda
    '''
    start_time = time.time()

    skewness = find_skewness(X)
    # dropna() makes the selection correct whichever container
    # find_skewness returns: a boolean mask on a DataFrame keeps the
    # non-matching rows as NaN, while on a Series it is a no-op.
    skewness = skewness[abs(skewness) > thresh].dropna()
    print("There are {} skewed numerical features to Box Cox transform".format(skewness.shape[0]))

    skewed_features = skewness.index
    for feat in skewed_features:
        X[feat] = boxcox1p(X[feat], lmbda)

    time_end = time.time() - start_time

    # The context manager closes the log even if one of the writes fails.
    with open(mlresult_dir + str(project_identifier) + "_log_feature_transformation.csv", "a") as f:
        f.write(sttime + '\n')
        f.write("\n Time taken to execute the 'transform skewed fetures' function is "+ str(time_end) + "\n" )

    print("\n Time taken to execute the function is "+ str(time_end) + "\n" + "Feature having skewness more than threshold value are" + str(skewed_features)+ '\n')
    return X
def predict_dataset(data_frame, city, columns):
    """
    Predicts prices of listings from a dataset.

    The feature matrix built by ``cm.to_matrix`` is transformed with
    ``boxcox1p(x, 0.15) + 1``. Rows whose current price is below the smallest
    value returned by ``do.detect_outlier`` are predicted with the city's
    second model, all other rows with the first; each prediction is mapped
    back through ``expm1``.

    :params data_frame dataframe:
    :params city str:
    :params columns list:
    :returns predictions array:
    """
    features, prices = cm.to_matrix(data_frame, columns)
    features = boxcox1p(features, 0.15) + 1

    # Both model files share the same per-city prefix.
    model_prefix = DATA_DIR + '/' + city
    regressor_a = joblib.load(model_prefix + c.MODEL_1_SUFFIX)
    regressor_b = joblib.load(model_prefix + c.MODEL_2_SUFFIX)

    outlier_boundary = min(do.detect_outlier(prices))
    below_boundary = (prices < outlier_boundary)
    at_or_above_boundary = (prices >= outlier_boundary)

    # Fill the result from whichever model is responsible for each row.
    predictions = np.zeros(len(prices))
    predictions[below_boundary] = np.expm1(
        regressor_b.predict(features[below_boundary]))
    predictions[at_or_above_boundary] = np.expm1(
        regressor_a.predict(features[at_or_above_boundary]))
    return predictions
def resolve_skewness(complete_df, numeric_features):
    """Box-Cox transform, in place, the positively skewed numeric features.

    Prints the skew of every column in ``numeric_features``, replaces each
    column whose skew is above 0.5 with ``boxcox1p`` of itself (lambda fitted
    by ``boxcox_normmax`` on the column shifted by 1), prints the skew again
    and returns the same DataFrame.
    """
    from scipy.stats import skew
    from scipy.special import boxcox1p
    from scipy.stats import boxcox_normmax

    def print_skew_report(title):
        # Skew per feature, largest first, framed by blank lines.
        per_feature = complete_df[numeric_features].apply(
            lambda x: skew(x)).sort_values(ascending=False)
        print()
        print(title)
        print(per_feature)
        print()
        return per_feature

    skew_before = print_skew_report('--------- SKEW OF FEATURES ----------')

    # Only positive skew is corrected here.
    for name in skew_before[skew_before > 0.5].index:
        fitted_lambda = boxcox_normmax(complete_df[name] + 1)
        complete_df[name] = boxcox1p(complete_df[name], fitted_lambda)

    # Check it is adjusted
    print_skew_report(
        '--------- SKEW OF FEATURES AFTER NORMALIZATION ----------')
    return complete_df
def get_process_skew_numeric_feature(data):
    """Box-Cox transform (lambda 0.15) the highly skewed numeric columns.

    Every non-object column of ``data`` whose non-null values have an
    absolute skew above 0.75 is replaced in place by ``boxcox1p`` of itself.

    :param data: DataFrame to modify in place.
    :return: the same DataFrame.
    """
    from scipy.special import boxcox1p

    numeric_feats = data.dtypes[data.dtypes != "object"].index

    # Check the skew of all numerical features
    skewed_feats = data[numeric_feats].apply(
        lambda x: skew(x.dropna())).sort_values(ascending=False)
    print("\nSkew in numerical features: \n")

    # Features whose skew will be corrected. The Series is filtered directly:
    # a boolean mask on a one-column DataFrame keeps every row (as NaN), so
    # all numeric features used to be transformed.
    skewness = skewed_feats[skewed_feats.abs() > 0.75]
    print("There are {} skewed numerical features to Box Cox transform".format(
        skewness.shape[0]))

    lam = 0.15
    for feat in skewness.index:
        data[feat] = boxcox1p(data[feat], lam)
    return data
def skew_features(self, df, verbose=False):
    """Box-Cox transform (lambda 0.15) the highly skewed numeric columns.

    Every non-object column of ``df`` whose non-null values have an absolute
    skew above 0.75 is replaced in place by ``boxcox1p`` of itself.

    :param df: DataFrame to modify in place.
    :param verbose: when true, print the ten largest skews and the number of
        columns selected.
    :return: the same DataFrame.
    """
    numeric_feats = df.dtypes[df.dtypes != "object"].index
    skewed_feats = df[numeric_feats].apply(
        lambda x: skew(x.dropna())).sort_values(ascending=False)
    skewness = pd.DataFrame({'Skew': skewed_feats})
    if verbose:
        print("\nSkew in numerical features: \n")
        print(skewness.head(10))

    # Select on the column: masking the whole DataFrame keeps every row (as
    # NaN), so all numeric features used to be transformed and counted.
    skewness = skewness[skewness['Skew'].abs() > 0.75]
    if verbose:
        print(
            "There are {} skewed numerical features to Box Cox transform".
            format(skewness.shape[0]))

    lam = 0.15
    for feat in skewness.index:
        df[feat] = boxcox1p(df[feat], lam)
    return df
def features_engineer(X):
    """Derive features, de-skew, scale and one-hot encode ``X``.

    Adds the totals ``TotalSF``, ``TotalPorchSF`` and ``TotalBath``, turns
    ``MSSubClass`` and ``YrSold`` into categoricals, replaces ``MoSold`` by
    its sine/cosine encoding over a 12-month cycle, Box-Cox transforms the
    numeric columns whose absolute skew exceeds 1, robust-scales all numeric
    columns and expands categoricals with ``pd.get_dummies``.

    The result is split by the indexes of the module-level ``train`` and
    ``test`` frames.

    :return: tuple ``(X_train, X_test)``
    """
    # Aggregated surface and bathroom counts.
    X["TotalSF"] = X["GrLivArea"] + X["TotalBsmtSF"]
    X["TotalPorchSF"] = X["OpenPorchSF"] + X["EnclosedPorch"] + X[
        "3SsnPorch"] + X["ScreenPorch"]
    X["TotalBath"] = X["FullBath"] + X["BsmtFullBath"] + 0.5 * (
        X["BsmtHalfBath"] + X["HalfBath"])

    categorical_codes = ["MSSubClass", "YrSold"]
    X[categorical_codes] = X[categorical_codes].astype("category")

    # Encode the month on a circle.
    X["SinMoSold"] = np.sin(2 * np.pi * X["MoSold"] / 12)
    X["CosMoSold"] = np.cos(2 * np.pi * X["MoSold"] / 12)
    X = X.drop("MoSold", axis=1)

    absolute_skew = X.skew(numeric_only=True).abs()
    for name in absolute_skew[absolute_skew > 1].index:
        X[name] = boxcox1p(X[name], boxcox_normmax(X[name] + 1))

    numeric_columns = X.select_dtypes(np.number).columns
    X[numeric_columns] = RobustScaler().fit_transform(X[numeric_columns])

    X = pd.get_dummies(X)

    X_train = X.loc[train.index]
    X_test = X.loc[test.index]
    return X_train, X_test
def boxcox_transform(self, X, y=None):
    """Apply ``boxcox1p`` with lambda 0.25 to the fixed set of columns.

    ``X`` is modified in place and returned; ``y`` is ignored. Each column is
    transformed with a single vectorized call instead of a per-element
    Python lambda.
    """
    lam = 0.25

    plain_columns = (
        'AveRooms', 'AveBedrms', 'HouseAge', 'Population',
        'AveOccup', 'Latitude', 'MedInc',
    )
    for name in plain_columns:
        X[name] = boxcox1p(X[name], lam)

    # an offset is needed because the data is negative
    X['Longitude'] = boxcox1p(X['Longitude'] + 125, lam)

    X['Target'] = boxcox1p(X['Target'], lam)
    return X
def do_transform(self, X, y = None):
    """Return a copy of ``X`` with the skewed columns Box-Cox transformed.

    A column is transformed with ``boxcox1p(column, self.lamda)`` when the
    absolute value of its ``Skew`` entry in ``self.skewness_df`` exceeds
    ``self.transform_treshold``. ``X`` itself is not modified; ``y`` is
    ignored.
    """
    result = X.copy()

    exceeds_threshold = abs(self.skewness_df.Skew) > self.transform_treshold
    transformable = self.skewness_df[exceeds_threshold].index
    logger.debug("There are |%d| skewed numerical features to Box Cox transform. These are: |%s|", transformable.shape[0], str(transformable))

    for name in transformable:
        result[name] = boxcox1p(result[name], self.lamda)
    return result
def replace_skew(self, df):
    """Replace each listed column by a Box-Cox transformed ``*_skewed_fix`` column.

    The column names are unpickled from the file named by
    ``SKEWED_HANDLING_COLUMNS``. For every name ``c`` a new column
    ``"<c>_skewed_fix"`` holding ``boxcox1p(df[c], 0.15)`` is added and the
    original column is dropped. ``df`` is modified in place and returned.
    """
    from scipy.special import boxcox1p

    with open(SKEWED_HANDLING_COLUMNS, 'rb') as handle:
        skewed_columns = pickle.load(handle)

    for original_name in skewed_columns:
        fixed_name = "{}_skewed_fix".format(original_name)
        df[fixed_name] = boxcox1p(df[original_name], 0.15)
        df.drop(original_name, axis=1, inplace=True)
    return df
def boxCoxTransform(train):
    """Apply ``boxcox1p`` with the module-level ``lda`` to every non-object column.

    ``train`` is modified in place and returned.

    NOTE(review): the skew is computed but only determines the order in which
    columns are processed; no threshold is applied, so every non-object
    column is transformed - confirm this is intended.
    """
    # get numeric features
    is_numeric = train.dtypes != "object"
    numeric_feats = train.dtypes[is_numeric].index

    # check skew (non-null values only), most skewed first
    skew_by_feature = train[numeric_feats].apply(lambda x: skew(x.dropna()))
    skew_by_feature = skew_by_feature.sort_values(ascending=False)

    for name in skew_by_feature.index:
        train[name] = boxcox1p(train[name], lda)
    return train
def box_cox_map(self, col_need, gamma=1.0, col_replace=True):
    """
    Box-Cox transform (of 1 + x) the given columns of ``self.df``.

        y = ((1+x)**gamma - 1) / gamma   if gamma != 0
            log(1+x)                     if gamma == 0

    ref: http://onlinestatbook.com/2/transformations/box-cox.html

    Each column is transformed with one vectorized ``boxcox1p`` call rather
    than element by element through ``DataFrame.applymap``.

    :param col_need: list of column names to transform
    :param gamma: Box-Cox parameter
    :param col_replace: when true, overwrite the columns; otherwise store the
        result in new columns named ``"<col>_boxCox"``
    :return: None
    """
    suffix = "" if col_replace else "_boxCox"
    # dict.fromkeys drops repeated names (keeping order) so a column listed
    # twice is not transformed twice.
    for col in dict.fromkeys(col_need):
        # astype(float) keeps numeric values stored with object dtype
        # working, as they did with the element-wise version.
        self.df[col + suffix] = boxcox1p(self.df[col].astype(float), gamma)
def apply_boxcox(data, skew, lam = 0.15, debug = False):
    """Box-Cox transform, in place, the columns of ``data`` flagged as skewed.

    :param data: DataFrame to modify in place.
    :param skew: skewness per feature, indexed by column name. Accepts a
        Series or a one-column DataFrame.
    :param lam: Box-Cox lambda passed to ``boxcox1p``.
    :param debug: when true, print how many features are transformed.
    :return: None
    """
    from scipy.special import boxcox1p

    # dropna() is what makes the DataFrame case work: a boolean mask on a
    # DataFrame keeps the non-matching rows as NaN instead of removing them.
    # For a Series the mask already filters and dropna() changes nothing.
    skew = skew[abs(skew) > 0.75].dropna()
    if debug:
        print("There are {} skewed numerical features to Box Cox transform".format(skew.shape[0]))

    for feat in skew.index:
        data[feat] = boxcox1p(data[feat], lam)
def transform(self, df):
    """Return a copy of ``df`` with the configured columns Box-Cox transformed.

    Every column named in ``self.columns_to_transform`` is replaced, in the
    copy, by ``boxcox1p(column, 0.15)``. The input frame is left untouched.
    """
    lam = 0.15
    result = df.copy()
    for name in self.columns_to_transform:
        result[name] = boxcox1p(result[name], lam)
    return result
def setUp(self):
    """Build the input frame, the transformer and the expected output frame.

    ``self.df`` holds four columns with some missing values;
    ``self.boxCoxTransformer`` is configured for ``column_c1`` and
    ``column_c3``; ``self.filled_df`` is the same frame with those two
    columns passed through ``boxcox1p`` (lambda 0.15).
    """
    lam = 0.15

    raw_columns = OrderedDict((
        ("column_c1", np.array([0, 2, np.nan, 0, 2, 2, 0, 2, 0])),
        ("column_c2", np.array([0, 1, np.nan, 2, 3, 4, 5.0, 6, np.nan])),
        ("column_c3", np.array([0, 1, 0, 0, 0, 0, 1, 0, np.nan])),
        ("column_c4", np.array([0, 1, 0, 0, 0, 0, 1, 0, np.nan])),
    ))
    self.df = pd.DataFrame(raw_columns)

    columns_to_transform = ["column_c1", "column_c3"]
    self.boxCoxTransformer = BoxCoxTransformer(columns_to_transform)

    # Expected result: same column order, two columns transformed.
    expected_columns = OrderedDict(raw_columns)
    for name in ("column_c1", "column_c3"):
        expected_columns[name] = boxcox1p(raw_columns[name], lam)
    self.filled_df = pd.DataFrame(expected_columns)
def box_cox(datacol, lam_min, lam_max, grain):
    """Grid-search the Box-Cox lambda for ``1 + datacol`` and transform the data.

    ``grain`` evenly spaced lambdas between ``lam_min`` and ``lam_max``
    (inclusive) are scored with the Box-Cox log-likelihood; the data is then
    transformed with ``boxcox1p`` using the best-scoring lambda.

    The likelihood is evaluated on ``1 + datacol`` because that is the
    quantity ``boxcox1p`` transforms. Scoring the unshifted data selected a
    lambda for a different transform than the one applied.

    :param datacol: 1-D data, every value greater than -1
    :param lam_min: lowest lambda tried
    :param lam_max: highest lambda tried
    :param grain: number of grid points
    :return: tuple ``(transformed data, best lambda)``
    """
    lam_range = np.linspace(lam_min, lam_max, grain)
    shifted = np.asarray(datacol) + 1
    llf = np.array(
        [stats.boxcox_llf(lam, shifted) for lam in lam_range], dtype=float)
    lam_best = lam_range[llf.argmax()]
    y = special.boxcox1p(datacol, lam_best)
    # return the transformed y and the best lambda
    return y, lam_best
def apply_box_cox_transform(dataset, column_names, lmbda=0):
    """Apply a Box-Cox transformation of 1 + x to every feature in column_names.

    :param dataset: DataFrame to modify in place.
    :param column_names: names of the columns to transform.
    :param lmbda: Box-Cox lambda; the default 0 is equivalent to ``log1p``.
    :return: the same DataFrame.
    """
    from scipy.special import boxcox1p

    for column_name in column_names:
        values = np.asarray(dataset[column_name].values)
        dataset[column_name] = boxcox1p(values, lmbda)
    return dataset
def boxcox_transformation(skew_value, lam, dataset):
    """Box-Cox transform, in place, the columns of ``dataset`` flagged as skewed.

    :param skew_value: skewness per feature, indexed by column name (Series
        or one-column DataFrame); features with an absolute value above 0.75
        are transformed.
    :param lam: Box-Cox lambda passed to ``boxcox1p``.
    :param dataset: DataFrame to modify in place.
    :return: always 0
    """
    from scipy.special import boxcox1p

    # Work from the skew_value argument; dropna() removes the rows that a
    # DataFrame mask leaves behind as NaN.
    skewness = skew_value[abs(skew_value) > 0.75].dropna()
    print("There are {} skewed numerical features to Box Cox transform".format(skewness.shape[0]))

    for feat in skewness.index:
        dataset[feat] = boxcox1p(dataset[feat], lam)

    print('......')
    print('transform finished')
    return 0
def test_boxcox1p_basic():
    """Check boxcox1p against its closed forms for a few lambdas.

    Written with direct assertions: as a generator ("yield") test the checks
    only run under runners that still iterate yielded tests.
    """
    x = np.array([-0.25, -1e-20, 0, 1e-20, 0.25, 1, 3])

    # lambda = 0  =>  y = log(1+x)
    y = boxcox1p(x, 0)
    assert_almost_equal(y, np.log1p(x))

    # lambda = 1  =>  y = x
    y = boxcox1p(x, 1)
    assert_almost_equal(y, x)

    # lambda = 2  =>  y = 0.5*((1+x)**2 - 1) = 0.5*x*(2 + x)
    y = boxcox1p(x, 2)
    assert_almost_equal(y, 0.5*x*(2 + x))

    # x = -1 and lambda > 0  =>  y = -1 / lambda
    lam = np.array([0.5, 1, 2])
    y = boxcox1p(-1, lam)
    assert_almost_equal(y, -1.0 / lam)
def fixing_skewness(data):
    """Box-Cox transform, in place, the skewed non-object columns of ``data``.

    A column is transformed when the absolute value of its skew exceeds 0.5;
    the lambda is fitted by ``boxcox_normmax`` on the column shifted by 1 and
    applied with ``boxcox1p``. Nothing is returned.
    """
    # Every column that is not of "object" type.
    not_object = data.dtypes != "object"
    numeric_feats = data.dtypes[not_object].index

    # Skew of each numeric column, most skewed first.
    skew_by_feature = data[numeric_feats].apply(lambda x: skew(x))
    skew_by_feature = skew_by_feature.sort_values(ascending=False)

    strongly_skewed = skew_by_feature[abs(skew_by_feature) > 0.5]
    for name in strongly_skewed.index:
        fitted_lambda = boxcox_normmax(data[name] + 1)
        data[name] = boxcox1p(data[name], fitted_lambda)
def test_inv_boxcox():
    """Each inverse transform must undo its forward transform."""
    round_trips = (
        (boxcox, inv_boxcox),
        (boxcox1p, inv_boxcox1p),
    )
    for forward, inverse in round_trips:
        x = np.array([0., 1., 2.])
        lam = np.array([0., 1., 2.])
        recovered = inverse(forward(x, lam), lam)
        assert_almost_equal(x, recovered)
def test_boxcox1p_underflow():
    """For tiny x and tiny lambda, boxcox1p must agree with log1p(x)."""
    tiny_x = np.array([1e-15, 1e-306])
    tiny_lmbda = np.array([1e-306, 1e-18])
    expected = np.log1p(tiny_x)
    assert_allclose(boxcox1p(tiny_x, tiny_lmbda), expected, rtol=1e-14)
print(skewness.head(10)) # Box Cox Transformation of (highly) skewed features # We use the scipy function boxcox1p which computes the Box-Cox transformation of 1+x . # Note that setting λ=0 is equivalent to log1p used above for the target variable. skewness = skewness[abs(skewness) > 0.75] print("There are {} skewed numerical features to Box Cox transform".format( skewness.shape[0])) from scipy.special import boxcox1p skewed_features = skewness.index lam = 0.15 for feat in skewed_features: # all_data[feat] += 1 all_data[feat] = boxcox1p(all_data[feat], lam) # Getting dummy categorical features all_data = pd.get_dummies(all_data) print(all_data.shape) # Getting the new train and test sets. train = all_data[:ntrain] test = all_data[ntrain:] #Validation function n_folds = 5 def rmsle_cv(model): kf = KFold( n_folds, shuffle=True, random_state=42).get_n_splits(train.values) rmse = np.sqrt(-cross_val_score(