예제 #1
0
def predict():
    """
    Render the prediction page, scoring the submitted form on POST.

    On POST the three form fields are parsed as floats, range-checked,
    Box-Cox (1 + x) transformed with lambda 0.15 and fed to the pickled
    model; the model output is mapped back with ``expm1`` and rounded to
    two decimals.  Non-numeric or out-of-range input yields a plain error
    message.  On any other method the template is rendered with
    ``prediction=None``.

    :return: rendered ``predict.html`` or an error string
    """
    # (form field, inclusive lower bound, inclusive upper bound), listed in
    # the order the model expects its features.
    field_ranges = (
        ('OverallQuality', 1, 10),
        ('LivingArea', 400, 5600),
        ('GarageCars', 0, 4),
    )

    # Defined up front so that a non-POST request does not hit an unbound name.
    model_prediction = None

    if request.method == 'POST':
        try:
            pred_args = []
            for field, low, high in field_ranges:
                value = float(request.form[field])
                # Reject explicitly instead of passing an empty-string
                # sentinel on to the model (NaN also fails this comparison).
                if not low <= value <= high:
                    raise ValueError('%s is out of range' % field)
                pred_args.append(boxcox1p(value, 0.15))

            # NOTE(review): pickle.load executes arbitrary code; the model
            # file must come from a trusted source.
            with open('lgb_model.pkl', 'rb') as f:
                ml_model = pickle.load(f)

            pred_args_arr = np.array(pred_args).reshape(1, -1)
            raw_prediction = np.expm1(ml_model.predict(pred_args_arr))
            # Take the single element explicitly; float() on an array is
            # deprecated in recent NumPy.
            model_prediction = round(float(np.ravel(raw_prediction)[0]), 2)
        except ValueError:
            return "Please check if the values are in the range!"
    return render_template('predict.html', prediction=model_prediction)
예제 #2
0
def adjust_skewness(df: pd.DataFrame, specific: str = None) -> pd.DataFrame:
    """
    Adjusts the skewness of columns by finding highly skewed columns
    and performing a boxcox1p transformation (lambda = 0.15) in place.

    A column is only replaced when its transformed values contain no NaN.

    :param df: pandas DataFrame to adjust skewed columns in
    :param specific: optional single column name; when given, only this
        column is transformed and the skew threshold is not consulted
    :return: the same DataFrame with skew adjusted columns
    """
    lam = 0.15

    if specific:
        skewed_features = [specific]
    else:
        numerics = [
            column for column, dtype in zip(df.columns, df.dtypes)
            if dtype.name != 'object' and dtype.name != 'category'
        ]
        if not numerics:
            return df
        skewness = df[numerics].apply(lambda x: skew(x.dropna())).sort_values(
            ascending=False)
        # Filter the Series itself: masking a DataFrame with a boolean
        # DataFrame only blanks cells to NaN and keeps every row, which
        # would make the threshold a no-op.
        skewed_features = skewness[skewness.abs() > 0.7].index

    for feat in skewed_features:
        transformed = boxcox1p(df[feat], lam)
        if not transformed.isnull().any():
            df[feat] = transformed

    return df
예제 #3
0
    def _transform_skewed_features(self, numerical_vars: np.ndarray) -> None:
        """
        Box-Cox transform the strongly skewed numerical features in place.

        Features whose absolute skew on ``self.X`` exceeds 0.75 are
        transformed with ``boxcox1p`` in both ``self.X`` and ``self.X_test``.
        The skew table is logged before and after the transformation.

        :param numerical_vars: Set of all originally numerical variables.
        """

        def log_skew_table():
            # Skew is measured on self.X only, with NaNs dropped per column.
            per_feature = self.X[numerical_vars].apply(
                lambda column: skew(column.dropna())).sort_values(
                    ascending=False)
            logging.info("Skew in numerical features: \n")
            table = pd.DataFrame({'Skew': per_feature})
            logging.info(table)
            return table

        logging.info(
            f'#{self._index()} - Determine and transform skewed features...')

        # skew before the transformation
        skewness = log_skew_table()

        strongly_skewed = skewness[abs(skewness.Skew) > 0.75].index
        logging.info(f"There are {strongly_skewed.size} skewed features")

        for feature in strongly_skewed:
            # Same lambda for both frames; self.X first, then self.X_test.
            for frame in (self.X, self.X_test):
                frame[feature] = boxcox1p(frame[feature], box_cox_lambda)

        # skew after the transformation
        log_skew_table()
        logging.info(f'#{self._step_index} - DONE!')
예제 #4
0
def transform_numerical_features(df_train, df_test):
    """
    Log-transform the target and Box-Cox transform positively skewed features.

    ``df_train['SalePrice']`` is replaced by ``log1p`` of itself.  Every
    other non-object column of ``df_train`` whose skewness exceeds 0.5 is
    transformed with ``boxcox1p`` in both frames, using a lambda estimated
    from the training column only.  Both frames are modified in place.

    TODO currently deals with positively skewed features only; columns with
    skewness below -0.5 are left untouched. Analyse this.

    :param df_train: training DataFrame, must contain 'SalePrice'
    :param df_test: test DataFrame with the same feature columns
    :return: None
    """
    # SalePrice. Check log against log(1+x), log1p
    df_train['SalePrice'] = np.log1p(df_train['SalePrice'])

    # Skewness of every non-object feature (the target is excluded).
    # The builtin float replaces np.float, which was removed from NumPy.
    features_skewness = [
        [k, float(stats.skew(df_train[k]))]
        for k in df_train.columns
        if k != 'SalePrice' and df_train.dtypes[k] != object
    ]
    features_skewness_df = pd.DataFrame(features_skewness,
                                        columns=['F', 'S']).sort_values(by='S')
    positively_skewed = features_skewness_df.loc[
        features_skewness_df['S'] > 0.5, 'F']

    # boxcox1p(x, lmbda):
    #   y = log(1+x)                    if lmbda == 0
    #   y = ((1+x)**lmbda - 1) / lmbda  if lmbda != 0
    # The Box-Cox power transformation only works on strictly positive data,
    # hence lambda is estimated on (x + 1) to match what boxcox1p transforms.
    for f in positively_skewed:
        box_cox_coef = stats.boxcox_normmax(df_train[f] + 1)
        df_train[f] = special.boxcox1p(df_train[f], box_cox_coef)
        df_test[f] = special.boxcox1p(df_test[f], box_cox_coef)
예제 #5
0
def hello():
    """
    Score the submitted house description and render the result page.

    Reads the form fields, label-encodes ``street``, ``condition`` and
    ``pool`` with the pickled encoder, applies ``boxcox1p(x, 0.15) + 1`` to
    the numeric fields, one-hot encodes the categorical columns with the
    stored encoders and renders ``result.html`` with ``expm1`` of the model
    output.
    """
    form = request.form
    story = str(form['story'])
    area = form['area']
    street = form['street']
    # Read but not used below; kept so a missing field is still rejected.
    utilities = form['utilities']
    neighbor = form['neighbor']
    bldgtype = form['bldgtype']
    housestyle = form['housestyle']
    quality = form['quality']
    condition = str(form['condition'])
    year = form['year']
    foundation = form['foundation']
    garage = form['garage']
    pool = form['pool']

    # NOTE(review): pickle.load executes arbitrary code; the encoder file
    # must come from a trusted source.  The context manager closes the file
    # even when loading fails.
    with open('Encoder.pkl', 'rb') as pkl_file:
        lbl = pickle.load(pkl_file)

    # street was previously listed (and decoded) twice; once is enough for
    # an element-wise encoder.
    test = [street, condition, pool]
    print(test)
    street, condition, pool = lbl.transform(test)

    area = boxcox1p(float(area), 0.15) + 1
    story = boxcox1p(float(story), 0.15) + 1
    condition = boxcox1p(float(condition), 0.15) + 1
    quality = boxcox1p(float(quality), 0.15) + 1
    year = boxcox1p(float(year), 0.15) + 1

    test_data = np.asarray([[area, bldgtype, foundation, garage, housestyle,
                             neighbor, story, condition, quality, pool,
                             street, year]])

    labelencoder_dict = joblib.load('labelencoder_dict.joblib')
    onehotencoder_dict = joblib.load('onehotencoder_dict.joblib')
    model = joblib.load('xgboost_model.joblib')

    # Column positions of the categorical fields inside test_data
    # (bldgtype, foundation, garage, housestyle, neighbor).
    categorical_positions = (1, 2, 3, 4, 5)
    n_rows = test_data.shape[0]
    encoded_data = None
    for i in range(test_data.shape[1]):
        if i in categorical_positions:
            feature = labelencoder_dict[i].transform(test_data[:, i])
            feature = feature.reshape(n_rows, 1)
            feature = onehotencoder_dict[i].transform(feature)
        else:
            feature = test_data[:, i].reshape(n_rows, 1)
        if encoded_data is None:
            encoded_data = feature
        else:
            encoded_data = np.concatenate((encoded_data, feature), axis=1)

    price = np.expm1(model.predict(encoded_data))
    print(price[0])  # This is your answer

    return render_template("result.html", result=price[0])
예제 #6
0
def test_boxcox1p_nonfinite():
    """Check the non-finite results of boxcox1p at and below x = -1.

    The assertions are called directly rather than yielded: generator-style
    tests are not executed by current pytest releases.
    """
    # x < -1  =>  y = nan
    x = np.array([-2, -2, -1.5])
    y = boxcox1p(x, [0.5, 2.0, -1.5])
    assert_equal(y, np.array([np.nan, np.nan, np.nan]))

    # x = -1 and lambda <= 0  =>  y = -inf
    x = -1
    y = boxcox1p(x, [-2.5, 0])
    assert_equal(y, np.array([-np.inf, -np.inf]))
예제 #7
0
def test_boxcox1p_nonfinite():
    """Check the non-finite results of boxcox1p at and below x = -1.

    The assertions are called directly rather than yielded: generator-style
    tests are not executed by current pytest releases.
    """
    # x < -1  =>  y = nan
    x = np.array([-2, -2, -1.5])
    y = boxcox1p(x, [0.5, 2.0, -1.5])
    assert_equal(y, np.array([np.nan, np.nan, np.nan]))

    # x = -1 and lambda <= 0  =>  y = -inf
    x = -1
    y = boxcox1p(x, [-2.5, 0])
    assert_equal(y, np.array([-np.inf, -np.inf]))
def transform_continuous_by_boxcox(df, columns_to_treat=None, lambda_value=0.15, skewness_threshold=0.75): # For highly skewed features
    """
    Apply ``boxcox1p`` in place to the given (or auto-detected) columns.

    :param df: DataFrame to modify in place
    :param columns_to_treat: columns to transform; when empty or None the
        columns whose absolute 'Skew' (as reported by
        ``statil.analyse_skew``) exceeds ``skewness_threshold`` are used
    :param lambda_value: Box-Cox lambda passed to ``boxcox1p``
    :param skewness_threshold: absolute skew above which a column is
        auto-selected
    :return: the same DataFrame, for convenient chaining
    """
    from scipy.special import boxcox1p

    auto_detect = columns_to_treat is None or len(columns_to_treat) == 0
    if auto_detect: # Find columns yourself
        # Imported lazily: only the auto-detection path needs the helper.
        from KUtils.stat import statil
        skewed_feet_df = statil.analyse_skew(df)
        # Honour the caller's threshold instead of a hard-coded 0.75.
        skewed_feet_df = skewed_feet_df.loc[
            abs(skewed_feet_df['Skew']) > skewness_threshold]
        columns_to_treat = list(skewed_feet_df.index)

    for a_col in columns_to_treat:
        if auto_detect:
            print('Transforming {0} using boxcox1p'.format(a_col))
        df[a_col] = boxcox1p(df[a_col], lambda_value)
    return df
예제 #9
0
    def numeric_transform(self, df):
        """
        Box-Cox transform (lambda = 0.15) the strongly skewed numeric columns.

        Candidate columns are all non-object columns except ``self.coly``
        and the 'train' marker column.  Only columns whose absolute skew
        (NaNs dropped) exceeds 0.75 are transformed, in place.

        :param df: DataFrame to transform
        :return: the same DataFrame
        """
        numeric_feats = [
            col for col in df.columns
            if col != self.coly and df[col].dtypes != 'O' and col != 'train'
        ]

        # Check the skew of all numerical features
        skewed_feats = df[numeric_feats].apply(
            lambda x: skew(x.dropna())).sort_values(ascending=False)
        print("\nSkew in numerical features: \n")
        skewness = pd.DataFrame({'Skew': skewed_feats})

        # Select rows with a boolean Series.  Masking with a boolean
        # DataFrame would only blank cells to NaN and keep every row, so
        # every numeric column would be transformed regardless of its skew.
        skewness = skewness[skewness['Skew'].abs() > 0.75]
        print("There are {} skewed numerical features to Box Cox transform".
              format(skewness.shape[0]))

        lam = 0.15
        for feat in skewness.index:
            df[feat] = boxcox1p(df[feat], lam)

        return df
예제 #10
0
def dampenSkew(df, num_features):
    """
    Box-Cox transform, in place, the features with absolute skew above 1.0.

    The lambda for each selected feature is estimated with
    ``boxcox_normmax`` on the feature shifted by one, matching the
    (1 + x) form that ``boxcox1p`` transforms.

    :param df: DataFrame to modify
    :param num_features: names of the numerical columns to inspect
    :return: the same DataFrame
    """
    skew_by_feature = df[num_features].apply(skew).sort_values(
        ascending=False)
    strongly_skewed = skew_by_feature[skew_by_feature.abs() > 1.0]
    for feature in strongly_skewed.index:
        column = df[feature]
        df[feature] = boxcox1p(column, boxcox_normmax(column + 1))
    return df
    def transform(self, X, y=None):
        """
        Apply the configured skew corrections to the columns present in X.

        Columns listed in ``self.numerical_features`` get
        ``boxcox1p(x, 0.70)``, those in ``self.further_skewed`` get
        ``log1p`` and those in ``self.special_skewed`` are squared, in that
        order.  Configured names missing from ``X`` are ignored.  ``X`` is
        modified in place and returned; ``y`` is unused.
        """
        import numpy as np

        from scipy.special import boxcox1p

        present = set(X.columns.to_list())

        # (columns to touch, transformation), applied group after group.
        plan = (
            (list(set(self.numerical_features) & present),
             lambda column: boxcox1p(column, 0.70)),
            (list(set(self.further_skewed) & present),
             np.log1p),
            (list(set(self.special_skewed) & present),
             lambda column: column ** 2),
        )

        for features, func in plan:
            for feature in features:
                X[feature] = func(X[feature])

        return X
            
            
        
예제 #12
0
    def __autoTransform(self, df, col):
        """
        Try several skew-reducing transforms on ``df[col]`` and maybe pick one.

        Candidate transforms are computed, their skewness is measured, and
        the candidate selected by ``np.argmax`` is returned if its skewness
        is lower than the value stored in
        ``self.skew_transform_[col]['after']``; in that case the stored
        'after' value and 'method' name are updated.  Otherwise the column
        is returned unchanged.

        NOTE(review): the candidate with the *largest* skewness is the one
        compared and returned; if the goal is to minimise skew this may be
        unintended -- confirm with the author.

        :param df: DataFrame holding the column
        :param col: name of the column to transform
        :return: the transformed values, or ``df[col]`` when no update is made
        """
        # Names line up index-by-index with the entries of `transformed`.
        methods = ['log1p', 'sqrt', 'boxcox', 'boxcox1p']

        # boxcox can work on positive value only
        if df[col].min() > 0:
            transformed = [
                np.log1p(df[col]),
                np.sqrt(df[col]),
                # presumably scipy.stats.boxcox, whose first return element
                # is the transformed data -- TODO confirm the import
                boxcox(df[col])[0],
                boxcox1p(df[col], 0.15)
            ]

        # column minimum is not strictly positive: both Box-Cox variants
        # are left out and only the first two methods are tried
        else:
            transformed = [np.log1p(df[col]), np.sqrt(df[col])]

        # calculate the skewness of each candidate
        skewness = list(map(lambda x: skew(x), transformed))
        max_skew_idx = np.argmax(skewness)

        # record the candidate only when it beats the stored 'after' skewness
        if skewness[max_skew_idx] < self.skew_transform_[col]['after']:
            self.skew_transform_[col]['after'] = skewness[max_skew_idx]
            self.skew_transform_[col]['method'] = methods[max_skew_idx]

            return transformed[max_skew_idx]

        return df[col]
예제 #13
0
def trans_skewed(X, thresh=0.75, lmbda=0):
    '''
    Power or log transformation of the skewed features of X (in place).

    y = ((1+x)**lmbda - 1) / lmbda  if lmbda != 0
    log(1+x)                    if lmbda == 0

    The elapsed time is appended to the project's feature-transformation
    log file and printed.

    parameter:
    X : feature matrix
    thresh: threshold for (absolute) skewness
    lmbda: Box-Cox lambda passed to boxcox1p
    returns: the same feature matrix
    '''
    start_time = time.time()
    skewness = find_skewness(X)
    # NOTE(review): if find_skewness returns a DataFrame rather than a
    # Series, this mask keeps every row -- confirm its return type.
    skewness = skewness[abs(skewness) > thresh]
    print("There are {} skewed numerical features to Box Cox transform".format(skewness.shape[0]))
    skewed_features = skewness.index

    for feat in skewed_features:
        X[feat] = boxcox1p(X[feat], lmbda)
    time_end = time.time() - start_time

    # Context manager so the log file is closed even if a write fails.
    log_path = mlresult_dir + str(project_identifier) + "_log_feature_transformation.csv"
    with open(log_path, "a") as f:
        f.write(sttime + '\n')
        f.write("\n Time taken to execute the 'transform skewed fetures' function is "+ str(time_end) + "\n" )
    print("\n Time taken to execute the function is "+ str(time_end) + "\n" + "Feature having skewness more than threshold value are" + str(skewed_features)+ '\n')
    return X
예제 #14
0
def predict_dataset(data_frame, city, columns):
    """
    Predict listing prices for a dataset with the city's two stored models.

    Features are transformed with ``boxcox1p(x, 0.15) + 1``.  Listings whose
    current price is below the smallest detected outlier are scored with
    model 2, all others with model 1; model outputs are mapped back with
    ``expm1``.

    :params data_frame dataframe:
    :params city str:
    :params columns list:
    :returns predictions array:
    """
    features, prices = cm.to_matrix(data_frame, columns)
    features = boxcox1p(features, 0.15) + 1

    # load both models of the city
    model_prefix = DATA_DIR + '/' + city
    regressor_a = joblib.load(model_prefix + c.MODEL_1_SUFFIX)
    regressor_b = joblib.load(model_prefix + c.MODEL_2_SUFFIX)

    # the smallest detected outlier separates the two price regimes
    outlier_boundary = min(do.detect_outlier(prices))
    below_boundary = prices < outlier_boundary
    at_or_above_boundary = prices >= outlier_boundary

    # fill the predictions regime by regime
    predictions = np.zeros(len(prices))
    predictions[below_boundary] = np.expm1(
        regressor_b.predict(features[below_boundary]))
    predictions[at_or_above_boundary] = np.expm1(
        regressor_a.predict(features[at_or_above_boundary]))

    return predictions
def resolve_skewness(complete_df, numeric_features):
    """
    Box-Cox transform, in place, the features with skewness above 0.5.

    The skewness of ``numeric_features`` is printed before and after the
    transformation.  Only positively skewed features are transformed; each
    uses a lambda estimated by ``boxcox_normmax`` on the column shifted by
    one.

    :param complete_df: DataFrame to modify
    :param numeric_features: names of the numerical columns to inspect
    :return: the same DataFrame
    """
    from scipy.special import boxcox1p
    from scipy.stats import boxcox_normmax
    from scipy.stats import skew

    def report_skew(title):
        # Compute, print and return the per-feature skewness, largest first.
        skew_features = complete_df[numeric_features].apply(
            lambda x: skew(x)).sort_values(ascending=False)
        print()
        print(title)
        print(skew_features)
        print()
        return skew_features

    skew_features = report_skew('--------- SKEW OF FEATURES ----------')

    for i in skew_features[skew_features > 0.5].index:
        complete_df[i] = boxcox1p(complete_df[i],
                                  boxcox_normmax(complete_df[i] + 1))

    # Check it is adjusted
    report_skew('--------- SKEW OF FEATURES AFTER NORMALIZATION ----------')
    return complete_df
예제 #16
0
def get_process_skew_numeric_feature(data):
    """
    Box-Cox transform (lambda = 0.15) the strongly skewed numeric columns.

    All non-object columns are inspected; those whose absolute skew (NaNs
    dropped) exceeds 0.75 are transformed with ``boxcox1p`` in place.

    :param data: DataFrame to transform
    :return: the same DataFrame
    """
    from scipy.special import boxcox1p

    numeric_feats = data.dtypes[data.dtypes != "object"].index

    # Check the skew of all numerical features
    skewed_feats = data[numeric_feats].apply(
        lambda x: skew(x.dropna())).sort_values(ascending=False)
    print("\nSkew in numerical features: \n")
    skewness = pd.DataFrame({'Skew': skewed_feats})

    # Keep only the features whose skew needs treatment.  The mask is a
    # boolean Series: a boolean DataFrame mask would merely blank cells to
    # NaN and keep every row, making the threshold a no-op.
    skewness = skewness[skewness['Skew'].abs() > 0.75]
    print("There are {} skewed numerical features to Box Cox transform".format(
        skewness.shape[0]))

    lam = 0.15
    for feat in skewness.index:
        data[feat] = boxcox1p(data[feat], lam)

    return data
예제 #17
0
    def skew_features(self, df, verbose=False):
        """
        Box-Cox transform (lambda = 0.15) the strongly skewed numeric columns.

        All non-object columns are inspected; those whose absolute skew
        (NaNs dropped) exceeds 0.75 are transformed with ``boxcox1p`` in
        place.

        :param df: DataFrame to transform
        :param verbose: print the skew table and the number of treated columns
        :return: the same DataFrame
        """
        numeric_feats = df.dtypes[df.dtypes != "object"].index
        skewed_feats = df[numeric_feats].apply(
            lambda x: skew(x.dropna())).sort_values(ascending=False)

        skewness = pd.DataFrame({'Skew': skewed_feats})

        if verbose:
            print("\nSkew in numerical features: \n")
            print(skewness.head(10))

        # Select rows with a boolean Series.  Masking with a boolean
        # DataFrame would only blank cells to NaN and keep every row, so
        # every numeric column would be transformed regardless of its skew.
        skewness = skewness[skewness['Skew'].abs() > 0.75]

        if verbose:
            print(
                "There are {} skewed numerical features to Box Cox transform".
                format(skewness.shape[0]))

        lam = 0.15
        for feat in skewness.index:
            df[feat] = boxcox1p(df[feat], lam)

        return df
예제 #18
0
def features_engineer(X):
    """
    Build the model features and split them back into train and test rows.

    Adds aggregate area / porch / bath columns, turns ``MSSubClass`` and
    ``YrSold`` into categoricals, replaces ``MoSold`` by its sine / cosine
    encoding, Box-Cox transforms numeric columns with absolute skew above 1,
    robust-scales the numeric columns and one-hot encodes the rest.

    :param X: combined feature DataFrame
    :return: tuple ``(X_train, X_test)`` selected by the indexes of the
        module-level ``train`` and ``test`` frames
    """
    # aggregate features
    X["TotalSF"] = X["GrLivArea"] + X["TotalBsmtSF"]
    X["TotalPorchSF"] = (X["OpenPorchSF"] + X["EnclosedPorch"]
                         + X["3SsnPorch"] + X["ScreenPorch"])
    half_baths = X["BsmtHalfBath"] + X["HalfBath"]
    X["TotalBath"] = X["FullBath"] + X["BsmtFullBath"] + 0.5 * half_baths

    # codes that look numeric but are really categories
    categorical_codes = ["MSSubClass", "YrSold"]
    X[categorical_codes] = X[categorical_codes].astype("category")

    # cyclic encoding of the month of sale
    month_angle = 2 * np.pi * X["MoSold"] / 12
    X["SinMoSold"] = np.sin(month_angle)
    X["CosMoSold"] = np.cos(month_angle)
    X = X.drop(columns="MoSold")

    # Box-Cox with a per-column lambda for strongly skewed numeric columns
    abs_skew = X.skew(numeric_only=True).abs()
    for col in abs_skew[abs_skew > 1].index:
        X[col] = boxcox1p(X[col], boxcox_normmax(X[col] + 1))

    numeric_cols = X.select_dtypes(np.number).columns
    X[numeric_cols] = RobustScaler().fit_transform(X[numeric_cols])

    X = pd.get_dummies(X)

    return X.loc[train.index], X.loc[test.index]
예제 #19
0
 def boxcox_transform(self, X, y=None):
     """
     Apply ``boxcox1p`` with lambda 0.25 to the housing columns of X.

     Every listed column is transformed element by element and written back
     into ``X``; ``Longitude`` is shifted by 125 first.  ``y`` is unused.

     :return: the same DataFrame
     """
     def _boxcox(value):
         return boxcox1p(value, 0.25)

     plain_columns = ('AveRooms', 'AveBedrms', 'HouseAge', 'Population',
                      'AveOccup', 'Latitude', 'MedInc')
     for column in plain_columns:
         X[column] = X[column].apply(_boxcox)

     # an offset is needed because the longitude values are negative
     X['Longitude'] = X['Longitude'].apply(
         lambda longitude: _boxcox(longitude + 125))
     X['Target'] = X['Target'].apply(_boxcox)
     return X
예제 #20
0
    def do_transform(self, X, y = None):
        """
        Return a copy of X with the strongly skewed columns Box-Cox transformed.

        Columns are taken from ``self.skewness_df`` where the absolute
        ``Skew`` exceeds ``self.transform_treshold``; each is transformed
        with ``boxcox1p`` using ``self.lamda``.  ``X`` itself is left
        untouched and ``y`` is unused.
        """
        skew_magnitude = abs(self.skewness_df.Skew)
        transformable = self.skewness_df[skew_magnitude > self.transform_treshold].index
        logger.debug("There are |%d| skewed numerical features to Box Cox transform. These are: |%s|", transformable.shape[0], str(transformable))

        result = X.copy()
        for column in transformable:
            result[column] = boxcox1p(result[column], self.lamda)
        return result
예제 #21
0
 def replace_skew(self, df):
     """
     Replace each stored skewed column by its Box-Cox transformed version.

     The column names are unpickled from ``SKEWED_HANDLING_COLUMNS``.  For
     every name ``c`` a column ``<c>_skewed_fix`` holding
     ``boxcox1p(df[c], 0.15)`` is added and the original column is dropped
     in place.

     :return: the same DataFrame
     """
     from scipy.special import boxcox1p

     with open(SKEWED_HANDLING_COLUMNS, 'rb') as handle:
         skewed_columns = pickle.load(handle)

     for name in skewed_columns:
         fixed_name = "{}_skewed_fix".format(name)
         df[fixed_name] = boxcox1p(df[name], 0.15)
         df.drop(name, axis=1, inplace=True)
     return df
예제 #22
0
def boxCoxTransform(train):
    """
    Apply ``boxcox1p`` with the module-level ``lda`` to every numeric column.

    The skewness is computed only to order the columns (most skewed first);
    no threshold is applied, so all non-object columns are transformed in
    place.

    :param train: DataFrame to transform
    :return: the same DataFrame
    """
    numeric_columns = train.dtypes[train.dtypes != "object"].index
    skew_by_column = train[numeric_columns].apply(
        lambda column: skew(column.dropna()))
    for column in skew_by_column.sort_values(ascending=False).index:
        train[column] = boxcox1p(train[column], lda)
    return train
 def box_cox_map(self, col_need, gamma=1.0, col_replace=True):
     """
     Box-Cox (1 + x) transform the given columns of ``self.df``.

     y = ((1+x)**gamma - 1) / gamma  if gamma != 0
         log(1+x)                    if gamma == 0
     ref: http://onlinestatbook.com/2/transformations/box-cox.html

     :param col_need: list of column names to transform
     :param gamma: Box-Cox parameter passed to ``boxcox1p``
     :param col_replace: overwrite the columns when True, otherwise store
         the result in new columns named ``<col>_boxCox``
     :return: None
     """
     # boxcox1p is vectorised, so it is applied one column at a time; this
     # avoids the element-wise DataFrame.applymap, which pandas deprecated.
     transformed = self.df[col_need].apply(
         lambda column: boxcox1p(column, gamma))
     if col_replace:
         self.df[col_need] = transformed
     else:
         col_need_extend = [col + "_boxCox" for col in col_need]
         self.df[col_need_extend] = transformed
예제 #24
0
def apply_boxcox(data, skew, lam = 0.15, debug = False):
    """
    Box-Cox transform, in place, the columns of ``data`` flagged as skewed.

    :param data: DataFrame to modify
    :param skew: skewness values indexed by column name; entries with an
        absolute value above 0.75 select the columns to transform
    :param lam: Box-Cox lambda passed to ``boxcox1p``
    :param debug: when True, print how many features are selected
    :return: None
    """
    from scipy.special import boxcox1p

    strongly_skewed = skew[abs(skew) > 0.75]
    if debug == True:
        print("There are {} skewed numerical features to Box Cox transform".format(strongly_skewed.shape[0]))
    for feat in strongly_skewed.index:
        data[feat] = boxcox1p(data[feat], lam)
    def transform(self, df):
        """
        Return a copy of ``df`` with the configured columns Box-Cox transformed.

        Each column in ``self.columns_to_transform`` is replaced by
        ``boxcox1p(column, 0.15)``; the input frame is not modified.
        """
        lam = 0.15
        transformed = df.copy()
        for column in self.columns_to_transform:
            transformed[column] = boxcox1p(transformed[column], lam)

        return transformed
    def setUp(self):
        """
        Build the input frame, the transformer and the expected output frame.

        ``self.df`` holds four raw columns, ``self.boxCoxTransformer`` is
        configured for ``column_c1`` and ``column_c3``, and
        ``self.filled_df`` holds the same data with those two columns
        passed through ``boxcox1p`` (lambda 0.15).
        """
        nan = np.nan
        raw_columns = OrderedDict((
            ("column_c1", np.array([0, 2, nan, 0, 2, 2, 0, 2, 0])),
            ("column_c2", np.array([0, 1, nan, 2, 3, 4, 5.0, 6, nan])),
            ("column_c3", np.array([0, 1, 0, 0, 0, 0, 1, 0, nan])),
            ("column_c4", np.array([0, 1, 0, 0, 0, 0, 1, 0, nan])),
        ))
        self.df = pd.DataFrame(raw_columns)

        columns_to_transform = ["column_c1", "column_c3"]
        self.boxCoxTransformer = BoxCoxTransformer(columns_to_transform)

        # Expected result: same column order, transformed columns swapped in.
        lam = 0.15
        expected_columns = OrderedDict(raw_columns)
        for name in ("column_c1", "column_c3"):
            expected_columns[name] = boxcox1p(raw_columns[name], lam)

        self.filled_df = pd.DataFrame(expected_columns)
예제 #27
0
def box_cox(datacol, lam_min, lam_max, grain):
    """
    Grid-search a Box-Cox lambda and transform the data with it.

    ``grain`` evenly spaced lambdas between ``lam_min`` and ``lam_max`` are
    scored with ``stats.boxcox_llf``; the best-scoring one is used for
    ``special.boxcox1p``.

    NOTE(review): the likelihood is evaluated on ``datacol`` while the
    transform is applied to ``1 + datacol`` -- confirm this is intended.

    :return: tuple ``(transformed data, best lambda)``
    """
    candidates = np.linspace(lam_min, lam_max, grain)
    log_likelihood = np.array(
        [stats.boxcox_llf(lam, datacol) for lam in candidates], dtype=float)
    lam_best = candidates[log_likelihood.argmax()]
    # return the transformed y and the best lambda
    return special.boxcox1p(datacol, lam_best), lam_best
예제 #28
0
def apply_box_cox_transform(dataset, column_names):
    """Apply a Box-Cox transformation with lambda 0 (i.e. log(1 + x)) to
    every feature in column_names of dataset, in place, and return dataset."""
    from scipy.special import boxcox1p

    for name in column_names:
        raw_values = np.asarray(dataset[name].values)
        dataset[name] = boxcox1p(raw_values, 0)

    return dataset
예제 #29
0
def boxcox_transformation(skew_value,lam,dataset):
    """
    Box-Cox transform, in place, the columns of ``dataset`` flagged as skewed.

    :param skew_value: skewness values indexed by column name; entries with
        an absolute value above 0.75 select the columns to transform
    :param lam: Box-Cox lambda passed to ``boxcox1p``
    :param dataset: DataFrame to modify
    :return: always 0
    """
    from scipy.special import boxcox1p

    # Work from the skew_value argument; the dropna() also removes the rows
    # that a DataFrame-shaped input would merely blank out under the mask.
    skewness = skew_value[abs(skew_value) > 0.75].dropna()
    print("There are {} skewed numerical features to Box Cox transform".format(skewness.shape[0]))
    skewed_features = skewness.index
    for feat in skewed_features:
        dataset[feat] = boxcox1p(dataset[feat], lam)
    print('......')
    print('transform finished')
    return 0
예제 #30
0
def test_boxcox1p_basic():
    """Check boxcox1p against closed forms for a few lambdas.

    The assertions are called directly rather than yielded: generator-style
    tests are not executed by current pytest releases.
    """
    x = np.array([-0.25, -1e-20, 0, 1e-20, 0.25, 1, 3])

    # lambda = 0  =>  y = log(1+x)
    y = boxcox1p(x, 0)
    assert_almost_equal(y, np.log1p(x))

    # lambda = 1  =>  y = x
    y = boxcox1p(x, 1)
    assert_almost_equal(y, x)

    # lambda = 2  =>  y = 0.5*((1+x)**2 - 1) = 0.5*x*(2 + x)
    y = boxcox1p(x, 2)
    assert_almost_equal(y, 0.5*x*(2 + x))

    # x = -1 and lambda > 0  =>  y = -1 / lambda
    lam = np.array([0.5, 1, 2])
    y = boxcox1p(-1, lam)
    assert_almost_equal(y, -1.0 / lam)
예제 #31
0
def test_boxcox1p_basic():
    """Check boxcox1p against closed forms for a few lambdas.

    The assertions are called directly rather than yielded: generator-style
    tests are not executed by current pytest releases.
    """
    x = np.array([-0.25, -1e-20, 0, 1e-20, 0.25, 1, 3])

    # lambda = 0  =>  y = log(1+x)
    y = boxcox1p(x, 0)
    assert_almost_equal(y, np.log1p(x))

    # lambda = 1  =>  y = x
    y = boxcox1p(x, 1)
    assert_almost_equal(y, x)

    # lambda = 2  =>  y = 0.5*((1+x)**2 - 1) = 0.5*x*(2 + x)
    y = boxcox1p(x, 2)
    assert_almost_equal(y, 0.5*x*(2 + x))

    # x = -1 and lambda > 0  =>  y = -1 / lambda
    lam = np.array([0.5, 1, 2])
    y = boxcox1p(-1, lam)
    assert_almost_equal(y, -1.0 / lam)
예제 #32
0
def fixing_skewness(data):
    """
    Box-Cox transform, in place, the columns with absolute skew above 0.5.

    All non-object columns are inspected.  Each selected column uses a
    lambda estimated by ``boxcox_normmax`` on the column shifted by one.
    Nothing is returned; ``data`` is modified directly.
    """
    # every column that is not of "object" dtype counts as numeric
    non_object = data.dtypes != "object"
    numeric_feats = data.dtypes[non_object].index

    skew_by_feature = data[numeric_feats].apply(skew).sort_values(
        ascending=False)
    for feat in skew_by_feature[skew_by_feature.abs() > 0.5].index:
        column = data[feat]
        data[feat] = boxcox1p(column, boxcox_normmax(column + 1))
예제 #33
0
def test_inv_boxcox():
    """Round-trip check: each inverse undoes its forward Box-Cox transform."""
    transform_pairs = ((boxcox, inv_boxcox), (boxcox1p, inv_boxcox1p))
    for forward, inverse in transform_pairs:
        x = np.array([0., 1., 2.])
        lam = np.array([0., 1., 2.])
        recovered = inverse(forward(x, lam), lam)
        assert_almost_equal(x, recovered)
예제 #34
0
def test_boxcox1p_underflow():
    """For tiny x and tiny lambda, boxcox1p must agree with log1p."""
    tiny_x = np.array([1e-15, 1e-306])
    tiny_lambda = np.array([1e-306, 1e-18])
    assert_allclose(boxcox1p(tiny_x, tiny_lambda), np.log1p(tiny_x),
                    rtol=1e-14)
예제 #35
0
print(skewness.head(10))

# Box Cox Transformation of (highly) skewed features
# We use the scipy function boxcox1p which computes the Box-Cox transformation of  1+x .
# Note that setting  λ=0  is equivalent to log1p used above for the target variable.
# The dropna() matters when `skewness` is a DataFrame: a boolean-DataFrame
# mask only blanks the non-matching cells to NaN and keeps every row, so
# without it every feature would be transformed.  For a Series it is a no-op.
skewness = skewness[abs(skewness) > 0.75].dropna()
print("There are {} skewed numerical features to Box Cox transform".format(
    skewness.shape[0]))

from scipy.special import boxcox1p

skewed_features = skewness.index
lam = 0.15
for feat in skewed_features:
    all_data[feat] = boxcox1p(all_data[feat], lam)
# Getting dummy categorical features
all_data = pd.get_dummies(all_data)
print(all_data.shape)
# Getting the new train and test sets.
train = all_data[:ntrain]
test = all_data[ntrain:]

# Validation function
n_folds = 5


def rmsle_cv(model):
    kf = KFold(
        n_folds, shuffle=True, random_state=42).get_n_splits(train.values)
    rmse = np.sqrt(-cross_val_score(