Example #1
import numpy as np
import pandas as pd
from sklearn.preprocessing import power_transform


def transform_data(data, transform='log'):
    if transform == 'log':
        data = np.log(data)
    elif transform == 'box-cox':
        data = pd.DataFrame(power_transform(data, method='box-cox'), columns=data.columns)
    elif transform == 'yeo-johnson':
        data = pd.DataFrame(power_transform(data, method='yeo-johnson'), columns=data.columns)
    return data
Example #2
    def transform(self, X, **kwargs):

        data = X.copy()
        for col in self.cols:
            data[col] = power_transform(data[[col]])

        return data
Example #3
def log_power_transform(df: pd.DataFrame,
                        method: str = 'box-cox',
                        standardize: bool = False) -> Tuple[pd.DataFrame, str]:
    """
    Perform log or power transform of the input dataframe. If performing a
    power transformation, user has option to standardize afterwards.

    Parameters
    ----------
    df : The pd.DataFrame to be normalized. Can be either univariate or multivariate.
    method : The type of log/power transformation used. Current options are
             'box-cox', 'yeo-johnson', or 'log'.
    standardize : The option to standardize data after transformation. The default is False.

    Returns
    -------
    df_trans : The transformed dataframe.
    title : The key used to access df_pwr in CheckpointDict during run_package().

    """
    stan = ''
    if standardize:
        stan = 'Standardized'
    if method == 'log':
        df_trans = df.transform(np.log)
    else:
        data_trans = power_transform(df,
                                     method=method,
                                     standardize=standardize)
        df_trans = pd.DataFrame(data_trans, index=df.index, columns=df.columns)

    title = str(method.title() + ' ' + stan)
    return df_trans, title
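A minimal usage sketch for log_power_transform (hypothetical values; assumes numpy, pandas, Tuple and sklearn.preprocessing.power_transform are imported as in the function above):

import pandas as pd

# small strictly positive frame so that 'box-cox' is valid
df = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': [10.0, 20.0, 40.0]})
df_trans, title = log_power_transform(df, method='box-cox', standardize=True)
print(title)           # 'Box-Cox Standardized'
print(df_trans.shape)  # (3, 2)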
Example #4
def num_yeo(X, cols, prefix='yeo_', **params):
    # Yeo-Johnson Transformer
    params['method'] = 'yeo-johnson'
    _X = X.copy()
    _cols = [prefix + col for col in cols]
    _x = power_transform(X[cols], **params)
    _X[_cols] = pd.DataFrame(_x, columns=_cols, index=_X.index)
    return _X, _cols
Example #5
def num_boxcox(X, cols, prefix='bc_', **params):
    # Box-Cox Transformer
    params['method'] = 'box-cox'
    _X = X.copy()
    _cols = [prefix + col for col in cols]
    _x = power_transform(X[cols], **params)
    _X[_cols] = pd.DataFrame(_x, columns=_cols, index=_X.index)
    return _X, _cols
Example #6
    def ts_scaler(self, object_id):
        test = self.ts_df.drop('mean_detected', axis=1).loc[object_id]

        # object_id_set = set(object_id_list)
        # scaled_df_v3 = pd.DataFrame()
        # for object_id in object_id_set:
        # test = scaler_df.loc[object_id]
        passband_set = set(test.index.get_level_values('passband'))

        scaled_df_v2 = pd.DataFrame()
        for passband in passband_set:
            tester = test.xs(passband, level='passband')
            scaled_df = pd.DataFrame()
            for i, column in enumerate(tester):

                # print(i, column)
                pd_series = tester[column]
                series_index = pd_series.index.tolist()

                if bool(re.search("range", column)):

                    # maxabs_scaler = preprocessing.MaxAbsScaler()
                    x = pd_series.values.reshape(-1,
                                                 1)  # returns a numpy array
                    x_scaled = preprocessing.maxabs_scale(x)
                    x_scaled = pd.Series(x_scaled[:, 0], index=series_index)

                else:

                    # power_scaler = preprocessing.PowerTransformer()
                    x = pd_series.values.reshape(-1,
                                                 1)  # returns a numpy array
                    try:
                        x_scaled = preprocessing.power_transform(
                            x, method='yeo-johnson')
                        x_scaled = pd.Series(x_scaled[:, 0],
                                             index=series_index)
                    except RuntimeWarning:
                        print(object_id, passband, column)

                scaled_df = pd.concat((scaled_df, x_scaled), axis=1)

            scaled_df.index.names = ['input']
            scaled_df.columns = [x for x in tester.columns]
            scaled_df['passband'] = passband
            scaled_df.set_index('passband', append=True, inplace=True)
            scaled_df_v2 = pd.concat((scaled_df_v2, scaled_df), axis=0)
            scaled_df_v2 = scaled_df_v2.reorder_levels(['input', 'passband'])

        scaled_df_v2['object_id'] = object_id
        scaled_df_v2.set_index('object_id', append=True, inplace=True)
        scaled_df_v2 = scaled_df_v2.reorder_levels(
            ['object_id', 'input', 'passband'])
        # scaled_df_v3 = pd.concat((scaled_df_v3, scaled_df_v2), axis=0)

        final_scaled_df = pd.concat(
            (scaled_df_v2, self.ts_df[['mean_detected']]), axis=1, sort=False)
        return final_scaled_df
Example #7
    def train(self):
        #Get a dataset. This is Microsoft stock data.
        df = pm.datasets.load_msft()
        df = df.drop(columns=['Date', 'Volume', 'OpenInt'])

        #Dataset shape is now (7983,4)
        print(df.shape)

        #define the series to be forecasted (user specified)
        y = df['High']
        y = np.array(y)
        y = y.reshape(-1, 1)

        #exog represents the exogeneous variables (user specified)
        exog = df[['Open', 'Low', 'Close']]
        exog = np.array(exog)

        #Box-Cox transform on y and exog
        y = power_transform(y, method='box-cox')
        exog = power_transform(exog, method='box-cox')

        y_train, y_test = pm.model_selection.train_test_split(y, test_size=0.2)
        exog_train, exog_test = pm.model_selection.train_test_split(
            exog, test_size=0.2)

        arima = pm.auto_arima(y_train,
                              exog_train,
                              start_p=1,
                              d=None,
                              start_q=1,
                              information_criterion='aic',
                              maxiter=100,
                              method='lbfgs',
                              test='kpss',
                              stepwise=True)

        forecasts = arima.predict(y_test.shape[0], exog_test)

        error = smape(y_test, forecasts)
        mae = mean_absolute_error(y_test, forecasts)
        print("Symmetric Mean Absolute Percentage Error: ", error)
        print("Mean Absolute Error: ", mae)
Example #8
    def transform(self, df):
        df = df.copy()

        for col in self.to_transform:
            if self.how == 'log':
                df[col] = np.log(1 + df[col])
            elif self.how == 'yj':
                df[col] = skl_preproc.power_transform(
                    df[col].values.reshape(-1, 1),
                    method='yeo-johnson',
                    standardize=self.standardize)
            elif self.how == 'boxcox':
                df[col] = skl_preproc.power_transform(
                    df[col].values.reshape(-1, 1),
                    method='box-cox',
                    standardize=self.standardize)
            elif self.how == 'boxcox1p':
                df[col] = skl_preproc.power_transform(
                    1 + df[col].values.reshape(-1, 1),
                    method='box-cox',
                    standardize=self.standardize)

        return df
Example #9
def newBoxCoxTranformation(df, target):

    # assuming that only numerical features are present
    print("Shape of the dataset before transformation : ", df.shape)

    y = df[target].apply(lambda x: math.log(x))
    X = df.drop(target, axis=1)
    x_columns = list(X)
    X = preprocessing.MinMaxScaler(feature_range=(1, 2)).fit_transform(X)
    X = preprocessing.power_transform(X, method='box-cox')
    #X = pd.DataFrame(X,columns=x_columns)
    print("Shape of the dataset after transformation : ", X.shape, y.shape)

    return X, y
Example #10
def normalize(dataframe, type):
    global normalized_data
    if type == 'zscore':
        clean_data = dataframe.select_dtypes(['number'])
        cleaner_data = clean_data.dropna(how='any')
        normalized_data = cleaner_data.apply(zscore)
    elif type == 'minmax':
        clean_data = dataframe.select_dtypes(['number'])
        cleaner_data = clean_data.dropna(how='any')
        minmax_data = minmax_scale(cleaner_data)
        normalized_data = pd.DataFrame(minmax_data)
    elif type == 'l1_norm':
        clean_data = dataframe.select_dtypes(['number'])
        cleaner_data = clean_data.dropna(how='any')
        # assumes: from sklearn.preprocessing import normalize as sk_normalize
        # (a bare normalize(...) call here would recurse into this function)
        norm_data = sk_normalize(cleaner_data, norm='l1')
        normalized_data = pd.DataFrame(norm_data)
    elif type == 'l2_norm':
        clean_data = dataframe.select_dtypes(['number'])
        cleaner_data = clean_data.dropna(how='any')
        norm_data = sk_normalize(cleaner_data, norm='l2')
        normalized_data = pd.DataFrame(norm_data)
    elif type == 'power_yeo':
        clean_data = dataframe.select_dtypes(['number'])
        cleaner_data = clean_data.dropna(how='any')
        power_data = power_transform(cleaner_data, method='yeo-johnson')
        normalized_data = pd.DataFrame(power_data)
    elif type == 'power_box':
        clean_data = dataframe.select_dtypes(['number'])
        cleaner_data = clean_data.dropna(how='any')
        power_data = power_transform(cleaner_data, method='box-cox')
        normalized_data = pd.DataFrame(power_data)
    elif type == 'quantile':
        clean_data = dataframe.select_dtypes(['number'])
        cleaner_data = clean_data.dropna(how='any')
        quantile_data = quantile_transform(cleaner_data)
        normalized_data = pd.DataFrame(quantile_data)
    return normalized_data
Example #11
def pow_transformer(df, column_names):
    '''Takes in a dataframe and the columns that need to be power
    scaled. Loops through those columns and power transforms the
    continuous variables.'''
    dataframes = []
    copy_df = df.copy()
    for column in column_names:
        # power_transform returns an ndarray, so wrap it back into a DataFrame
        new_df = pd.DataFrame(power_transform(np.array(df[column]).reshape(-1, 1)),
                              index=df.index)
        new_df.columns = [column + '_' + str(name) for name in new_df.columns]
        dataframes.append(new_df)
        copy_df.drop(column, axis=1, inplace=True)
    new_df = pd.concat(dataframes, axis=1)
    return pd.concat([copy_df, new_df], axis=1)
Example #12
def main():
    data_path = r'C:\Users\win10\Desktop\Projects\CYB\Experiment_Balint\CYB004\Data'
    n_channels = 8
    X = np.empty((n_channels, 0))
    for file in sorted([f for f in os.listdir(data_path) if f.endswith('.json')]):
        if 'Stair' not in file:
            continue
        with open(data_path + '\\' + file) as json_file:
            dict_data = json.load(json_file)
        emg_data = np.array(dict_data["EMG"])
        X = np.hstack((X, emg_data))
    X_std = np.std(X, axis=1)
    X_mean = np.mean(X, axis=1)
    X = (X - X_mean[:, None]) / X_std[:, None]
    a = power_transform(np.expand_dims(np.abs(X[0]),0).T, method='box-cox')
    nProcess = multiprocessing.cpu_count()
    with multiprocessing.Pool(nProcess) as pool:
        lambdas = pool.map(parallel_proc, X)

    with open(data_path + r'\lambdas.csv', "w", newline='') as f:
        writer = csv.writer(f)
        writer.writerows(lambdas)
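parallel_proc is not defined in this snippet; a minimal sketch of what such a worker might look like, assuming each call fits a Box-Cox lambda to the absolute values of one standardized EMG channel (hypothetical helper; the small offset keeps the input strictly positive, as Box-Cox requires):

from scipy import stats
import numpy as np

def parallel_proc(channel):
    # fit the Box-Cox exponent by maximum likelihood on |x| shifted away from zero
    _, lmbda = stats.boxcox(np.abs(channel) + 1e-6)
    return [lmbda]  # one row per channel for csv.writer.writerows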

Example #13
models = [clf_bag, clf_log]

get_split_loader = get_split_loader_func(3, X)

#evaluate(models, [tr], X, y, get_split_loader)

from sklearn.preprocessing import power_transform

    
nor_dis = TurnToNormDist(cols_with_nan)

new_X = nor_dis.transform(X)
new_X.shape==X.shape
power_transform(X[["sector"]])
X.head()
import seaborn as sns
sns.distplot( new_X["return_1w"])

def select_feat_by_corr(X, y, threshold=0.09):
    y.columns = ["Target"]
    data = pd.concat([X,y], axis=1)
    _corr = data.corr()[["Target"]].sort_values("Target")
    feat_to_drop=list(_corr[(_corr["Target"]< threshold)& (_corr["Target"]>-threshold)].index)
    #X.drop(feat_to_drop,axis=1,inplace=True)
    return feat_to_drop, _corr

corr_cols, a = select_feat_by_corr(X, y, 0.2)
a
pd.concat([X,y], axis=1)
Example #14
def preprocess(features: np.ndarray,
               target: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    features = power_transform(features, standardize=True)
    target = label_binarize(target, classes=np.unique(target))
    return features, target
Example #15
p = Path(__file__).parents[1]

# To load project modules
import sys
sys.path.append(str(p))
from src.logger import LOGGER
from src import estimators as e

LOGGER.info('Load data')
df = pd.read_pickle(p.joinpath('data', 'interim', 'research.pkl'))
X = df.drop(labels='loss', axis=1)
y = df['loss'].copy()

LOGGER.info('Process target')
y = pd.Series(data=power_transform(y.values.reshape(-1, 1)).flatten(),
              name='loss',
              index=y.index)

LOGGER.info('Load categorical features to drop')
noVarFeatures = json.load(
    open(file=p.joinpath('src', 'meta', 'NoVariance.json'), mode='r'))

LOGGER.info('Process categorical features')
catf = pd.DataFrame(data=make_pipeline(
    e.CategoricalGrouper(), e.CategoricalEncoder()).fit_transform(
        X.filter(like='cat').drop(labels=noVarFeatures, axis=1), y),
                    columns=X.filter(like='cat').drop(labels=noVarFeatures,
                                                      axis=1).columns,
                    index=X.index)
Example #16
File: heatmap.py  Project: qi-zh/XenonPy
    def fit(self, desc):
        desc = minmax_scale(desc, axis=0)
        if self.bc:
            desc = power_transform(desc, method='yeo-johnson')
        self.desc = desc
        return self
Example #17
def gp_fit_test(x_train: Tensor,
                y_train: Tensor,
                error_train: Tensor,
                x_test: Tensor,
                y_test: Tensor,
                error_test: Tensor,
                gp_obj_model: SingleTaskGP,
                gp_error_model: SingleTaskGP,
                tkwargs: Dict[str, Any],
                gp_test_folder: str,
                obj_out_wp: bool = False,
                err_out_wp: bool = False) -> None:
    """
    1) Estimates mean test error between predicted and the true objective function values.
    2) Estimates mean test error between predicted recon. error by the gp_model and the true recon. error of the vae_model.
    :param x_train: normalised points at which the gps were trained
    :param y_train: objective value function corresponding to x_train that were used as targets of `gp_obj_model`
    :param error_train: reconstruction error value at points x_train that were used as targets of `gp_error_model`
    :param x_test: normalised test points
    :param y_test: objective value function corresponding to x_test
    :param error_test: reconstruction error at test points
    :param gp_obj_model: the gp model trained to predict the black box objective function values
    :param gp_error_model: the gp model trained to predict reconstruction error
    :param tkwargs: dict of type and device
    :param gp_test_folder: folder to save test results
    :param obj_out_wp: if the `gp_obj_model` was trained with output warping then need to apply the same transform
    :param err_out_wp: if the `gp_error_model` was trained with output warping then need to apply the same transform
    :return: (Sum_i||true_y_i - pred_y_i||^2 / n_points, Sum_i||true_recon_i - pred_recon_i||^2 / n_points)
    """
    do_robust = True if gp_error_model is not None else False
    if not os.path.exists(gp_test_folder):
        os.mkdir(gp_test_folder)

    gp_obj_model.eval()
    gp_obj_model.to(tkwargs['device'])
    y_train = y_train.view(-1)
    if do_robust:
        gp_error_model.eval()
        gp_error_model.to(tkwargs['device'])
        error_train = error_train.view(-1)

    with torch.no_grad():
        if obj_out_wp:
            Y_numpy = y_train.cpu().numpy()
            if Y_numpy.min() <= 0:
                y_train = torch.FloatTensor(
                    power_transform(Y_numpy / Y_numpy.std(),
                                    method='yeo-johnson'))
            else:
                y_train = torch.FloatTensor(
                    power_transform(Y_numpy / Y_numpy.std(), method='box-cox'))
                if y_train.std() < 0.5:
                    Y_numpy = y_train.numpy()
                    y_train = torch.FloatTensor(
                        power_transform(Y_numpy / Y_numpy.std(),
                                        method='yeo-johnson')).to(x_train)

            Y_numpy = y_test.cpu().numpy()
            if Y_numpy.min() <= 0:
                y_test = torch.FloatTensor(
                    power_transform(Y_numpy / Y_numpy.std(),
                                    method='yeo-johnson'))
            else:
                y_test = torch.FloatTensor(
                    power_transform(Y_numpy / Y_numpy.std(), method='box-cox'))
                if y_test.std() < 0.5:
                    Y_numpy = y_test.numpy()
                    y_test = torch.FloatTensor(
                        power_transform(Y_numpy / Y_numpy.std(),
                                        method='yeo-johnson')).to(x_test)

        y_train = y_train.view(-1).to(**tkwargs)
        y_test = y_test.view(-1).to(**tkwargs)

        gp_obj_val_model_mse_train = (
            gp_obj_model.posterior(x_train).mean.view(-1) -
            y_train).pow(2).div(len(y_train))
        gp_obj_val_model_mse_test = (
            gp_obj_model.posterior(x_test).mean.view(-1) - y_test).pow(2).div(
                len(y_test))
        torch.save(
            gp_obj_val_model_mse_train,
            os.path.join(gp_test_folder, 'gp_obj_val_model_mse_train.npz'))
        torch.save(gp_obj_val_model_mse_test,
                   os.path.join(gp_test_folder, 'gp_obj_val_model_test.npz'))
        print(
            f'GP training fit on objective value: MSE={gp_obj_val_model_mse_train.sum().item():.5f}'
        )
        print(
            f'GP testing fit on objective value: MSE={gp_obj_val_model_mse_test.sum().item():.5f}'
        )

        if do_robust:
            if err_out_wp:
                error_train = error_train.view(-1, 1)
                R_numpy = error_train.cpu().numpy()
                if R_numpy.min() <= 0:
                    error_train = torch.FloatTensor(
                        power_transform(R_numpy / R_numpy.std(),
                                        method='yeo-johnson'))
                else:
                    error_train = torch.FloatTensor(
                        power_transform(R_numpy / R_numpy.std(),
                                        method='box-cox'))
                    if error_train.std() < 0.5:
                        R_numpy = error_train.numpy()
                        error_train = torch.FloatTensor(
                            power_transform(R_numpy / R_numpy.std(),
                                            method='yeo-johnson')).to(x_train)

                R_numpy = error_test.cpu().numpy()
                if R_numpy.min() <= 0:
                    error_test = torch.FloatTensor(
                        power_transform(R_numpy / R_numpy.std(),
                                        method='yeo-johnson'))
                else:
                    error_test = torch.FloatTensor(
                        power_transform(R_numpy / R_numpy.std(),
                                        method='box-cox'))
                    if error_test.std() < 0.5:
                        R_numpy = error_test.numpy()
                        error_test = torch.FloatTensor(
                            power_transform(R_numpy / R_numpy.std(),
                                            method='yeo-johnson')).to(x_test)

            error_train = error_train.view(-1).to(**tkwargs)
            error_test = error_test.view(-1).to(**tkwargs)

            pred_recon_train = gp_error_model.posterior(x_train).mean.view(-1)
            pred_recon_test = gp_error_model.posterior(x_test).mean.view(-1)

            gp_error_model_mse_train = (error_train -
                                        pred_recon_train).pow(2).div(
                                            len(error_train))
            gp_error_model_mse_test = (error_test -
                                       pred_recon_test).pow(2).div(
                                           len(error_test))
            torch.save(
                gp_error_model_mse_train,
                os.path.join(gp_test_folder, 'gp_error_model_mse_train.npz'))
            torch.save(
                gp_error_model_mse_test,
                os.path.join(gp_test_folder, 'gp_error_model_mse_test.npz'))
            print(
                f'GP training fit on reconstruction errors: MSE={gp_error_model_mse_train.sum().item():.5f}'
            )
            print(
                f'GP testing fit on reconstruction errors: MSE={gp_error_model_mse_test.sum().item():.5f}'
            )
            torch.save(error_test,
                       os.path.join(gp_test_folder, f"true_rec_err_z.pt"))
            torch.save(error_train,
                       os.path.join(gp_test_folder, f"error_train.pt"))

        torch.save(x_train, os.path.join(gp_test_folder, f"train_x.pt"))
        torch.save(x_test, os.path.join(gp_test_folder, f"test_x.pt"))
        torch.save(y_train, os.path.join(gp_test_folder, f"y_train.pt"))
        torch.save(x_test, os.path.join(gp_test_folder, f"X_test.pt"))
        torch.save(y_test, os.path.join(gp_test_folder, f"y_test.pt"))

        # y plots
        plt.hist(y_train.cpu().numpy(),
                 bins=100,
                 label='y train',
                 alpha=0.5,
                 density=True)
        plt.hist(gp_obj_model.posterior(x_train).mean.view(
            -1).detach().cpu().numpy(),
                 bins=100,
                 label='y pred',
                 alpha=0.5,
                 density=True)
        plt.legend()
        plt.title('Training set')
        plt.savefig(os.path.join(gp_test_folder, 'gp_obj_train.pdf'))
        plt.close()

        plt.hist(gp_obj_val_model_mse_train.detach().cpu().numpy(),
                 bins=100,
                 alpha=0.5,
                 density=True)
        plt.title('MSE of gp_obj_val model on training set')
        plt.savefig(os.path.join(gp_test_folder, 'gp_obj_train_mse.pdf'))
        plt.close()

        plt.hist(y_test.cpu().numpy(),
                 bins=100,
                 label='y true',
                 alpha=0.5,
                 density=True)
        plt.hist(gp_obj_model.posterior(x_test).mean.detach().cpu().numpy(),
                 bins=100,
                 alpha=0.5,
                 label='y pred',
                 density=True)
        plt.legend()
        plt.title('Validation set')
        plt.savefig(os.path.join(gp_test_folder, 'gp_obj_test.pdf'))
        plt.close()

        plt.hist(gp_obj_val_model_mse_test.detach().cpu().numpy(),
                 bins=100,
                 alpha=0.5,
                 density=True)
        plt.title('MSE of gp_obj_val model on validation set')
        plt.savefig(os.path.join(gp_test_folder, 'gp_obj_test_mse.pdf'))
        plt.close()

        if do_robust:
            # error plots
            plt.hist(error_train.cpu().numpy(),
                     bins=100,
                     label='error train',
                     alpha=0.5,
                     density=True)
            plt.hist(
                gp_error_model.posterior(x_train).mean.detach().cpu().numpy(),
                bins=100,
                label='error pred',
                alpha=0.5,
                density=True)
            plt.legend()
            plt.title('Training set')
            plt.savefig(os.path.join(gp_test_folder, 'gp_error_train.pdf'))
            plt.close()

            plt.hist(gp_error_model_mse_train.detach().cpu().numpy(),
                     bins=100,
                     alpha=0.5,
                     density=True)
            plt.title('MSE of gp_error model on training set')
            plt.savefig(os.path.join(gp_test_folder, 'gp_error_train_mse.pdf'))
            plt.close()

            plt.hist(error_test.cpu().numpy(),
                     bins=100,
                     label='error true',
                     alpha=0.5,
                     density=True)
            plt.hist(
                gp_error_model.posterior(x_test).mean.detach().cpu().numpy(),
                bins=100,
                alpha=0.5,
                label='error pred',
                density=True)
            plt.legend()
            plt.title('Validation set')
            plt.savefig(os.path.join(gp_test_folder, 'gp_error_test.pdf'))
            plt.close()

            plt.hist(gp_error_model_mse_test.detach().cpu().numpy(),
                     bins=100,
                     alpha=0.5,
                     density=True)
            plt.title('MSE of gp_error model on validation set')
            plt.savefig(os.path.join(gp_test_folder, 'gp_error_test_mse.pdf'))
            plt.close()

            # y-error plots
            y_train_sorted, indices_train = torch.sort(y_train)
            error_train_sorted = error_train[indices_train]
            gp_y_train_pred_sorted, indices_train_pred = torch.sort(
                gp_obj_model.posterior(x_train).mean.view(-1))
            gp_r_train_pred_sorted = (gp_error_model.posterior(
                x_train).mean.view(-1))[indices_train_pred]
            plt.scatter(y_train_sorted.cpu().numpy(),
                        error_train_sorted.cpu().numpy(),
                        label='true',
                        marker='+')
            plt.scatter(gp_y_train_pred_sorted.detach().cpu().numpy(),
                        gp_r_train_pred_sorted.detach().cpu().numpy(),
                        label='pred',
                        marker='*')
            plt.xlabel('y train targets')
            plt.ylabel('recon. error train targets')
            plt.title('y_train vs. error_train')
            plt.legend()
            plt.savefig(
                os.path.join(gp_test_folder, 'scatter_obj_error_train.pdf'))
            plt.close()

            y_test_std_sorted, indices_test = torch.sort(y_test)
            error_test_sorted = error_test[indices_test]
            gp_y_test_pred_sorted, indices_test_pred = torch.sort(
                gp_obj_model.posterior(x_test).mean.view(-1))
            gp_r_test_pred_sorted = (gp_error_model.posterior(
                x_test).mean.view(-1))[indices_test_pred]
            plt.scatter(y_test_std_sorted.cpu().numpy(),
                        error_test_sorted.cpu().numpy(),
                        label='true',
                        marker='+')
            plt.scatter(gp_y_test_pred_sorted.detach().cpu().numpy(),
                        gp_r_test_pred_sorted.detach().cpu().numpy(),
                        label='pred',
                        marker='*')
            plt.xlabel('y test targets')
            plt.ylabel('recon. error test targets')
            plt.title('y_test vs. error_test')
            plt.legend()
            plt.savefig(
                os.path.join(gp_test_folder, 'scatter_obj_error_test.pdf'))
            plt.close()

            # error var plots
            error_train_sorted, indices_train_pred = torch.sort(error_train)
            # error_train_sorted = error_train
            # indices_train_pred = np.arange(len(error_train))
            gp_r_train_pred_sorted = gp_error_model.posterior(
                x_train).mean[indices_train_pred].view(-1)
            gp_r_train_pred_std_sorted = gp_error_model.posterior(
                x_train).variance.view(-1).sqrt()[indices_train_pred]
            plt.scatter(np.arange(len(indices_train_pred)),
                        error_train_sorted.cpu().numpy(),
                        label='err true',
                        marker='+',
                        color='C1',
                        s=15)
            plt.errorbar(
                np.arange(len(indices_train_pred)),
                gp_r_train_pred_sorted.detach().cpu().numpy().flatten(),
                yerr=gp_r_train_pred_std_sorted.detach().cpu().numpy().flatten(
                ),
                fmt='*',
                alpha=0.05,
                label='err pred',
                color='C0',
                ecolor='C0')
            plt.scatter(np.arange(len(indices_train_pred)),
                        gp_r_train_pred_sorted.detach().cpu().numpy(),
                        marker='*',
                        alpha=0.2,
                        s=10,
                        color='C0')
            # plt.scatter(np.arange(len(indices_train_pred)),
            #             (gp_r_train_pred_sorted + gp_r_train_pred_std_sorted).detach().cpu().numpy(),
            #             label='err pred mean+std', marker='.')
            # plt.scatter(np.arange(len(indices_train_pred)),
            #             (gp_r_train_pred_sorted - gp_r_train_pred_std_sorted).detach().cpu().numpy(),
            #             label='err pred mean-std', marker='.')
            plt.legend()
            plt.title('error predictions and uncertainty on train set')
            plt.savefig(
                os.path.join(gp_test_folder, 'gp_error_train_uncertainty.pdf'))
            plt.close()

            error_test_sorted, indices_test_pred = torch.sort(error_test)
            # error_test_sorted = error_test
            # indices_test_pred = np.arange(len(error_test_sorted))
            gp_r_test_pred_sorted = gp_error_model.posterior(x_test).mean.view(
                -1)[indices_test_pred]
            gp_r_test_pred_std_sorted = gp_error_model.posterior(
                x_test).variance.view(-1).sqrt()[indices_test_pred]
            plt.scatter(np.arange(len(indices_test_pred)),
                        error_test_sorted.cpu().numpy(),
                        label='err true',
                        marker='+',
                        color='C1',
                        s=15)
            plt.errorbar(
                np.arange(len(indices_test_pred)),
                gp_r_test_pred_sorted.detach().cpu().numpy().flatten(),
                yerr=gp_r_test_pred_std_sorted.detach().cpu().numpy().flatten(
                ),
                marker='*',
                alpha=0.05,
                label='err pred',
                color='C0',
                ecolor='C0')
            plt.scatter(np.arange(len(indices_test_pred)),
                        gp_r_test_pred_sorted.detach().cpu().numpy().flatten(),
                        marker='*',
                        color='C0',
                        alpha=0.2,
                        s=10)
            # plt.scatter(np.arange(len(indices_test_pred)),
            #             (gp_r_test_pred_sorted + gp_r_test_pred_std_sorted).detach().cpu().numpy(),
            #             label='err pred mean+std', marker='.')
            # plt.scatter(np.arange(len(indices_test_pred)),
            #             (gp_r_test_pred_sorted - gp_r_test_pred_std_sorted).detach().cpu().numpy(),
            #             label='err pred mean-std', marker='.')
            plt.legend()
            plt.title('error predictions and uncertainty on test set')
            plt.savefig(
                os.path.join(gp_test_folder, 'gp_error_test_uncertainty.pdf'))
            plt.close()

        # y var plots
        y_train_std_sorted, indices_train = torch.sort(y_train)
        gp_y_train_pred_sorted = gp_obj_model.posterior(
            x_train).mean[indices_train].view(-1)
        gp_y_train_pred_std_sorted = gp_obj_model.posterior(
            x_train).variance.sqrt()[indices_train].view(-1)
        plt.scatter(np.arange(len(indices_train)),
                    y_train_std_sorted.cpu().numpy(),
                    label='y true',
                    marker='+',
                    color='C1',
                    s=15)
        plt.scatter(np.arange(len(indices_train)),
                    gp_y_train_pred_sorted.detach().cpu().numpy(),
                    marker='*',
                    alpha=0.2,
                    s=10,
                    color='C0')
        plt.errorbar(
            np.arange(len(indices_train)),
            gp_y_train_pred_sorted.detach().cpu().numpy().flatten(),
            yerr=gp_y_train_pred_std_sorted.detach().cpu().numpy().flatten(),
            fmt='*',
            alpha=0.05,
            label='y pred',
            color='C0',
            ecolor='C0')
        # plt.scatter(np.arange(len(indices_train_pred)),
        #             (gp_y_train_pred_sorted+gp_y_train_pred_std_sorted).detach().cpu().numpy(),
        #             label='y pred mean+std', marker='.')
        # plt.scatter(np.arange(len(indices_train_pred)),
        #             (gp_y_train_pred_sorted-gp_y_train_pred_std_sorted).detach().cpu().numpy(),
        #             label='y pred mean-std', marker='.')
        plt.legend()
        plt.title('y predictions and uncertainty on train set')
        plt.savefig(
            os.path.join(gp_test_folder, 'gp_obj_val_train_uncertainty.pdf'))
        plt.close()

        y_test_std_sorted, indices_test = torch.sort(y_test)
        gp_y_test_pred_sorted = gp_obj_model.posterior(x_test).mean.view(
            -1)[indices_test]
        gp_y_test_pred_std_sorted = gp_obj_model.posterior(
            x_test).variance.view(-1).sqrt()[indices_test]
        plt.scatter(np.arange(len(indices_test)),
                    y_test_std_sorted.cpu().numpy(),
                    label='y true',
                    marker='+',
                    color='C1',
                    s=15)
        plt.errorbar(
            np.arange(len(indices_test)),
            gp_y_test_pred_sorted.detach().cpu().numpy().flatten(),
            yerr=gp_y_test_pred_std_sorted.detach().cpu().numpy().flatten(),
            fmt='*',
            alpha=0.05,
            label='y pred',
            color='C0',
            ecolor='C0')
        plt.scatter(np.arange(len(indices_test)),
                    gp_y_test_pred_sorted.detach().cpu().numpy(),
                    marker='*',
                    alpha=0.2,
                    s=10,
                    color='C0')
        # plt.scatter(np.arange(len(indices_test_pred)),
        #             (gp_y_test_pred_sorted + gp_y_test_pred_std_sorted).detach().cpu().numpy(),
        #             label='y pred mean+std', marker='.')
        # plt.scatter(np.arange(len(indices_test_pred)),
        #             (gp_y_test_pred_sorted - gp_y_test_pred_std_sorted).detach().cpu().numpy(),
        #             label='y pred mean-std', marker='.')
        plt.legend()
        plt.title('y predictions and uncertainty on test set')
        plt.savefig(
            os.path.join(gp_test_folder, 'gp_obj_val_test_uncertainty.pdf'))
        plt.close()
Example #18
    def transform(self, X):
        X = X.copy()
        X[self.features] = power_transform(X[self.features], method='yeo-johnson')
        return X
Example #19
print(ks_statistic, p_value)

# Shapiro Wilk test # best test
    # If the P-Value of the Shapiro Wilk Test is larger than 0.05, we assume a normal distribution
    # If the P-Value of the Shapiro Wilk Test is smaller than 0.05, we do not assume a normal distribution
from scipy import stats
shapiro_test = stats.shapiro(data_MSTL)
print(shapiro_test.statistic, shapiro_test.pvalue)

# if the data is present in non-normal shape (which it is), it can be transformed into a normal distribution using the box cox
# https://www.statisticshowto.com/box-cox-transformation/
# Normality is an important assumption for many statistical techniques; 
# if your data isn’t normal, applying a Box-Cox means that you are able to run a broader number of tests.
from sklearn.preprocessing import power_transform
xt, lmbda = stats.yeojohnson(data_MSTL)
print(power_transform(data_MSTL["Temp"].values.reshape(-1, 1), method='yeo-johnson', standardize = False))
xts = power_transform(data_MSTL["Temp"].values.reshape(-1, 1), method='yeo-johnson')
shapiro_test = stats.shapiro(xt)
print(shapiro_test.statistic, shapiro_test.pvalue)

comparison = pd.concat([data_MSTL, pd.DataFrame(xt, index = data_MSTL.index).rename(columns={0: "stats-non-standardised"}), pd.DataFrame(xts, index = data_MSTL.index).rename(columns={0: "standardised"})], axis = 1)

fig = plt.figure()
ax1 = fig.add_subplot(221)
prob = stats.probplot(comparison["Temp"], dist=stats.norm, plot=ax1)
ax1.set_xlabel('')
ax1.set_title('Probplot against normal distribution')

ax2 = fig.add_subplot(222)
prob = stats.probplot(comparison["stats-non-standardised"], dist=stats.norm, plot=ax2)
ax2.set_title('Probplot after Yeo-Johnson transformation')
Example #20
Y = Y - 1

n_test = int(len(df) / 10)
Y_train = Y[n_test:]
Y_test = Y[:n_test]
X = df[[
    'LineFitGeoSplit1Params.n_hits', 'SplineMPEDirectHitsICB.n_early_strings',
    'SplineMPEDirectHitsICB.n_late_doms', 'SPEFitSingleTimeSplit1.azimuth',
    'ProjectedQ.max_grad_radius_circ_F', 'ProjectedQ.ratio',
    'BestTrackCramerRaoParams.cramer_rao_theta',
    'BestTrackCramerRaoParams.variance_theta',
    'BestTrackCramerRaoParams.variance_x',
    'BestTrackCramerRaoParams.variance_y',
    'BestTrackCramerRaoParams.covariance_theta_y',
    'SplineMPETruncatedEnergy_SPICEMie_DOMS_Muon.energy',
    'SplineMPETruncatedEnergy_SPICEMie_BINS_Muon.energy',
    'SPEFit2TimeSplit1BayesianFitParams.nmini',
    'LineFitTimeSplit2Params.n_hits', 'BestTrackDirectHitsICB.n_dir_pulses',
    'HitStatisticsValues.min_pulse_time', 'SplineMPEDirectHitsICE.n_dir_doms',
    'SplineMPEDirectHitsICE.n_late_strings', 'MPEFit_HVFitParams.nmini'
]]
#'SplineMPECharacteristicsIC.avg_dom_dist_q_tot_dom',
#'MPEFitHighNoiseFitParams.nmini']]
X_box = power_transform(X, method='yeo-johnson')
X_btrain = X_box[n_test:]  #splitting the dataframe
X_btest = X_box[:n_test]
estimator = LogisticAT()
selector = RFE(estimator, n_features_to_select=5, step=1)
selector.fit(X_box, Y)
print(selector.ranking_)
df.hist(bins=50, figsize=(20, 20))
plt.show()

#now we transform the data to Gaussian and delete outliers to see every feature better

dfT = pd.DataFrame()
for c in data.columns:
    #plt.figure(i)
    if c[0] != 'V':
        continue
    x = data[c]
    x = x.sort_values()
    x = x[20000:-20000]
    #x=sklearn.preprocessing.PowerTransformer(method='yeo-johnson', standardize=True, copy=True)
    X = power_transform(x.values.reshape(-1, 1),
                        method='yeo-johnson',
                        standardize=True,
                        copy=True)

    #print(X.shape)
    dfT[c] = X.squeeze()

dfT.hist(bins=50, figsize=(20, 20))
plt.show()

#Transform the data for modeling - without deleting outliers
#we use method 'yeo-johnson' because we have negative values
dataT = pd.DataFrame()
for c in data.columns:
    #plt.figure(i)
    if c[0] != 'V':
        continue
Example #22
    def suggest(self, n_suggestions=1, fix_input = None):
        if self.X.shape[0] < self.rand_sample:
            sample = self.quasi_sample(n_suggestions, fix_input)
            return sample
        else:
            X, Xe = self.space.transform(self.X)
            try:
                if self.y.min() <= 0:
                    y = torch.FloatTensor(power_transform(self.y / self.y.std(), method = 'yeo-johnson'))
                else:
                    y = torch.FloatTensor(power_transform(self.y / self.y.std(), method = 'box-cox'))
                    if y.std() < 0.5:
                        y = torch.FloatTensor(power_transform(self.y / self.y.std(), method = 'yeo-johnson'))
                if y.std() < 0.5:
                    raise RuntimeError('Power transformation failed')
                model = get_model(self.model_name, self.space.num_numeric, self.space.num_categorical, 1, **self.model_config)
                model.fit(X, Xe, y)
            except:
                y     = torch.FloatTensor(self.y).clone()
                model = get_model(self.model_name, self.space.num_numeric, self.space.num_categorical, 1, **self.model_config)
                model.fit(X, Xe, y)

            best_id = np.argmin(self.y.squeeze())
            best_x  = self.X.iloc[[best_id]]
            best_y  = y.min()
            py_best, ps2_best = model.predict(*self.space.transform(best_x))
            py_best = py_best.detach().numpy().squeeze()
            ps_best = ps2_best.sqrt().detach().numpy().squeeze()

            iter  = max(1, self.X.shape[0] // n_suggestions)
            upsi  = 0.5
            delta = 0.01
            # kappa = np.sqrt(upsi * 2 * np.log(iter **  (2.0 + self.X.shape[1] / 2.0) * 3 * np.pi**2 / (3 * delta)))
            kappa = np.sqrt(upsi * 2 * ((2.0 + self.X.shape[1] / 2.0) * np.log(iter) + np.log(3 * np.pi**2 / (3 * delta))))

            acq = MACE(model, py_best, kappa = kappa) # LCB < py_best
            mu  = Mean(model)
            sig = Sigma(model, linear_a = -1.)
            opt = EvolutionOpt(self.space, acq, pop = 100, iters = 100, verbose = False)
            rec = opt.optimize(initial_suggest = best_x, fix_input = fix_input).drop_duplicates()
            rec = rec[self.check_unique(rec)]

            cnt = 0
            while rec.shape[0] < n_suggestions:
                rand_rec = self.quasi_sample(n_suggestions - rec.shape[0], fix_input)
                rand_rec = rand_rec[self.check_unique(rand_rec)]
                rec      = rec.append(rand_rec, ignore_index = True)
                cnt +=  1
                if cnt > 3:
                    # sometimes the design space is so small that duplicated sampling is unavoidable
                    break 
            if rec.shape[0] < n_suggestions:
                rand_rec = self.quasi_sample(n_suggestions - rec.shape[0], fix_input)
                rec      = rec.append(rand_rec, ignore_index = True)

            select_id = np.random.choice(rec.shape[0], n_suggestions, replace = False).tolist()
            x_guess   = []
            with torch.no_grad():
                py_all       = mu(*self.space.transform(rec)).squeeze().numpy()
                ps_all       = -1 * sig(*self.space.transform(rec)).squeeze().numpy()
                best_pred_id = np.argmin(py_all)
                best_unce_id = np.argmax(ps_all)
                if best_unce_id not in select_id and n_suggestions > 2:
                    select_id[0]= best_unce_id
                if best_pred_id not in select_id and n_suggestions > 2:
                    select_id[1]= best_pred_id
                rec_selected = rec.iloc[select_id].copy()
            return rec_selected
Example #23
# To load project modules
import sys; sys.path.append(str(p))
from src.logger import LOGGER
from src import estimators as e
from src.ranker import Ranker


A4_DIMS = (11.7, 8.27)

LOGGER.info('Load data')
df = pd.read_pickle(p.joinpath('data', 'interim', 'research.pkl'))
X = df.drop(labels='loss', axis=1)
y = df['loss'].copy()

LOGGER.info('Process target')
y = pd.Series(data=power_transform(y.values.reshape(-1, 1)).flatten(), name='loss', index=y.index)

LOGGER.info('Load categorical features to drop')
noVarFeatures = json.load(open(file=p.joinpath('src', 'meta', 'NoVariance.json'), mode='r'))

LOGGER.info('Process categorical features')
catf = pd.DataFrame(
    data=make_pipeline(
        e.CategoricalGrouper(),
        e.CategoricalEncoder()
    ).fit_transform(X.filter(like='cat').drop(labels=noVarFeatures, axis=1), y),
    columns=X.filter(like='cat').drop(labels=noVarFeatures, axis=1).columns,
    index=X.index
)

LOGGER.info('Process continuous features')
Example #24
>>> from sklearn.preprocessing import RobustScaler
>>> X = [[ 1., -2.,  2.],
...      [ -2.,  1.,  3.],
...      [ 4.,  1., -2.]]
>>> transformer = RobustScaler().fit(X)
>>> transformer
RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)
>>> transformer.transform(X)
array([[ 0. , -2. ,  0. ],
       [-1. ,  0. ,  0.4],
       [ 1. ,  0. , -1.6]])

#Power transform
>>> import numpy as np
>>> from sklearn.preprocessing import power_transform
>>> data = [[1, 2], [3, 2], [4, 5]]
>>> print(power_transform(data, method='box-cox'))
[[-1.332... -0.707...]
 [ 0.256... -0.707...]
 [ 1.076...  1.414...]]


# feature interactions function

from itertools import combinations

def interactions(data):
    columns=list(data.columns)
    ls=list(combinations(columns, 2))
    for inter in ls:
        print(inter[0], inter[1])
        data[str(inter[0])+'_'+str(inter[1])]=data[str(inter[0])]+data[str(inter[1])]
Example #25
plt.subplots(figsize=(12, 8))
sns.residplot(train_final_1.KitchenQual,
              train_final_1.SalePrice).set_title('KITC W/out influential')

# Megaphone effect
plt.subplots(figsize=(12, 8))
sns.residplot(
    train_final_1.OverallQual,
    train_final_1.SalePrice).set_title('OverallQual W/out influential')

#------------------------------------------------------------------------------
### Transforming the data with boxcox

## Using power transform, method = boxcox
# The optimal parameter for stabilizing variance and minimizing skewness is estimated through maximum likelihood
print(power_transform(train_final_1, method='box-cox'))
train_final_1_boxcox = power_transform(train_final_1, method='box-cox')

## Converting the new boxcox np array back into a pd dataframe
# I can't believe this worked and I am so proud of myself
train_final_boxcox = pd.DataFrame(train_final_1_boxcox,
                                  index=train_final_1.index,
                                  columns=train_final_1.columns)
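The functional power_transform estimates lambda by maximum likelihood (per the comment above) but does not expose it; a sketch using PowerTransformer on the same data to inspect the fitted exponents (assuming train_final_1 is the strictly positive frame used above):

from sklearn.preprocessing import PowerTransformer

pt = PowerTransformer(method='box-cox', standardize=True)
pt.fit(train_final_1)
# one fitted lambda per column, in column order
print(dict(zip(train_final_1.columns, pt.lambdas_)))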

## Running the final reg. model with transformed dataframe
X = train_final_boxcox[[
    "OverallQual", "TotalBsmtSF", "GrLivArea", "KitchenQual"
]]
y = train_final_boxcox["SalePrice"]
X = sm.add_constant(X)
Example #26
'''
        ax = df_normalized.loc[str(anio), j].plot()
        ax.set_ylabel('Columnas');
        ax.set_xlabel('Anios');
'''


# Try transformations:
# We'll use 2:
# Box-Cox:
# Logit:  logit(p) = log(p/(1-p))


# First transformation
from sklearn.preprocessing import power_transform
df_normalized = df_normalized.replace(0, 0.00001)
transf_boxcox = power_transform(df_normalized, method='box-cox')
df_boxcox = pd.DataFrame(data = transf_boxcox)
df_boxcox.plot()
df_normalized = df_normalized.replace(0.00001, 0)
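The logit transformation mentioned above is not applied in the original; a minimal sketch, assuming df_normalized holds proportions in [0, 1] (clipped away from the endpoints with a hypothetical epsilon so the log is defined):

import numpy as np

eps = 1e-5
p = df_normalized.clip(lower=eps, upper=1 - eps)
df_logit = np.log(p / (1 - p))
df_logit.plot()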

# This is the same transformation but with another method
#transf_yeo = power_transform(df_indicators, method='yeo-johnson')
#dfyeo = pd.DataFrame(data = transf_yeo)
#dfyeo.plot()

# This is the same transformation but standardized; only the axes shift, the curve stays the same
#from sklearn.preprocessing import PowerTransformer
#power = PowerTransformer(method='yeo-johnson', standardize=True)
#data_trans = power.fit_transform(dfyeo)
#dfyeostandardized = pd.DataFrame(data = data_trans)
#dfyeostandardized.plot()
Example #27
import pandas as pd

Data = pd.read_csv('hack_final.csv')
from sklearn.preprocessing import power_transform

x = Data[['Click_count_y', 'Unique_products']]
x = power_transform(x, method='box-cox')
y = Data['y']
from sklearn.tree import DecisionTreeRegressor

regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(x, y)
import joblib

joblib.dump(regressor, 'model.pkl')
Example #28
def cleanData(df):
    # Drop variables with little variance
    df = df.drop([
        'Id', 'Alley', 'Street', 'LotShape', 'Utilities', 'LandSlope',
        'RoofMatl', 'Heating', 'Electrical', 'BsmtFinSF1', 'BsmtFinType2',
        'BsmtFinSF2', 'LowQualFinSF', 'BsmtHalfBath', 'KitchenAbvGr',
        '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC', 'MiscFeature',
        'MiscVal'
    ],
                 axis=1)

    # TotalSF variable
    df.loc[:, 'TotalSF'] = df['TotalBsmtSF'].apply(
        lambda x: 0 if pd.isna(x) else x) + df['1stFlrSF'] + df['2ndFlrSF']

    # Convert some integral classes to categorical
    df.MSSubClass = df.MSSubClass.astype('str')
    df.MoSold = df.MoSold.astype('str')
    df.YrSold = df.YrSold.astype('str')

    # Bin rare categories
    df.loc[df.MSSubClass.isin(['180', '75', '45', '40', '150']),
           'MSSubClass'] = 'Other'
    df.loc[df.MSZoning.isin(['RH', 'C (all)']), 'MSZoning'] = 'Other'
    df.loc[df.Neighborhood.isin(['Blueste', 'NPkVill']),
           'Neighborhood'] = 'Other'
    df.loc[df.Condition1.isin(['RRAe', 'RRAn', 'RRNe', 'RRNn']),
           'Condition1'] = 'Near railroad'
    df.loc[df.Condition1.isin(['PosA', 'PosN']),
           'Condition1'] = 'Near positive feature'
    df.loc[df.Condition2.isin(['RRAe', 'RRAn', 'RRNe', 'RRNn']),
           'Condition2'] = 'Near railroad'
    df.loc[df.Condition2.isin(['PosA', 'PosN']),
           'Condition2'] = 'Near positive feature'
    df.loc[df.HouseStyle.isin(['2.5Unf', '2.5Fin']), 'HouseStyle'] = '2.5'

    def lengthMap(x):
        if x == 0 or math.isnan(x):
            area = 'None'
        else:
            area = str(
                x // 50 * 50) + ' to ' + str((x // 50 + 1) * 50 - 1) + ' ft.'
        return area

    df.LotFrontage = df.LotFrontage.apply(lambda x: lengthMap(x))

    def remodelAgeMap(x):
        if x == 1950:
            era = 'No remodel'
        elif x > 1950 and x < 1960:
            era = '1950s'
        elif x >= 1960 and x < 1970:
            era = '1960s'
        elif x >= 1970 and x < 1980:
            era = '1970s'
        elif x >= 1980 and x < 1990:
            era = '1980s'
        elif x >= 1990 and x < 2000:
            era = '1990s'
        elif x >= 2000 and x < 2010:
            era = '2000s'
        else:
            era = '2010s'
        return era

    df.loc[:, 'RemodelEra'] = df.YearRemodAdd.apply(lambda x: remodelAgeMap(x))
    df = df.drop('YearRemodAdd', axis=1)

    df.loc[df.Exterior2nd.isin(['Wd Shng']), 'Exterior2nd'] = 'WdShing'
    df.loc[df.Exterior2nd.isin(['CmentBd']), 'Exterior2nd'] = 'CemntBd'
    df.loc[df.Exterior2nd.isin(['Brk Cmn']), 'Exterior2nd'] = 'BrkComm'

    df.loc[df.RoofStyle.isin(['Flat', 'Gambrel', 'Mansard', 'Shed']),
           'RoofStyle'] = 'Other'
    df.loc[df.Exterior1st.
           isin(['AsphShn', 'ImStucc', 'CBlock', 'Stone', 'BrkComm']),
           'Exterior1st'] = 'Other'
    df.loc[df.Exterior2nd.
           isin(['AsphShn', 'ImStucc', 'CBlock', 'Stone', 'BrkComm']),
           'Exterior2nd'] = 'Other'

    def areaMap(x):
        if x == 0:
            area = 'None'
        else:
            area = str(x // 50 * 50) + ' to ' + str((x // 50 + 1) * 50 -
                                                    1) + ' sq. ft.'
        return area

    df.loc[:, 'VeneerArea'] = df.MasVnrArea.apply(lambda x: areaMap(x))
    df = df.drop('MasVnrArea', axis=1)

    df.loc[df.ExterCond.isin(['Po', 'Fa']), 'ExterCond'] = 'Fa'
    df.loc[df.ExterCond.isin(['Gd', 'Ex']), 'ExterCond'] = 'Gd'
    df.loc[df.Foundation.isin(['Wood', 'Stone', 'Slab']),
           'Foundation'] = 'Other'
    df.loc[df.BsmtCond.isin(['Po', 'Fa']), 'BsmtCond'] = 'Fa'

    df.loc[:,
           'BasementUnfinishedSF'] = df.BsmtUnfSF.apply(lambda x: areaMap(x))
    df = df.drop('BsmtUnfSF', axis=1)

    df.loc[:, 'TotalBasementSF'] = df.TotalBsmtSF.apply(lambda x: areaMap(x))
    df = df.drop('TotalBsmtSF', axis=1)

    df.loc[df.HeatingQC.isin(['Po', 'Fa']), 'HeatingQC'] = 'Fa'

    df.loc[:, 'TotalIndoorSF'] = df['1stFlrSF'] + df['2ndFlrSF']
    df = df.drop(['1stFlrSF', '2ndFlrSF'], axis=1)

    df.loc[:, 'TwoBasementFullBath'] = df.BsmtFullBath.apply(
        lambda x: 'Yes' if x == 2 else 'No')
    df = df.drop('BsmtFullBath', axis=1)

    df.loc[:, 'TwoHalfBath'] = df.HalfBath.apply(lambda x: 'Yes'
                                                 if x == 2 else 'No')
    df = df.drop('HalfBath', axis=1)

    df.loc[df.Functional.isin(['Maj1', 'Maj2', 'Sev']), 'Functional'] = 'Other'
    df.loc[df.Functional.isin(['Min1', 'Min2']), 'Functional'] = 'Minimial'

    df.loc[df.GarageType.isin(['CarPort', '2Types']), 'GarageType'] = 'Other'

    df.GarageArea = df.GarageArea.apply(lambda x: areaMap(x))
    df.loc[df.GarageQual.isin(['Ex', 'Gd']), 'GarageQual'] = 'Gd'
    df.loc[df.GarageQual.isin(['Po', 'Fa']), 'GarageQual'] = 'Fa'

    df.loc[df.GarageCond.isin(['Ex', 'Gd']), 'GarageCond'] = 'Gd'
    df.loc[df.GarageCond.isin(['Po', 'Fa']), 'GarageCond'] = 'Fa'

    df.WoodDeckSF = df.WoodDeckSF.apply(lambda x: areaMap(x))
    df.OpenPorchSF = df.OpenPorchSF.apply(lambda x: areaMap(x))
    df.EnclosedPorch = df.EnclosedPorch.apply(lambda x: areaMap(x))

    df.loc[df.Fence.isin(['MnWw']), 'Fence'] = 'MnPrv'

    df.loc[df.SaleType.isin(['Con', 'Oth', 'CWD', 'ConLI', 'ConLw', 'ConLD']),
           'SaleType'] = 'Other'
    df.loc[df.SaleCondition.isin(['AdjLand', 'Alloca']),
           'SaleCondition'] = 'Other'

    # Impute missing values with a "None" feature or a computed feature
    df.loc[df.Fence.isna(), 'Fence'] = 'None'
    df.loc[df.FireplaceQu.isna(), 'FireplaceQu'] = 'None'
    df.loc[df.GarageCond.isna(), 'GarageCond'] = 'None'
    df.loc[df.GarageYrBlt.isna(), 'GarageYrBlt'] = 'None'
    df.loc[df.GarageFinish.isna(), 'GarageFinish'] = 'None'
    df.loc[df.GarageQual.isna(), 'GarageQual'] = 'None'
    df.loc[df.GarageType.isna(), 'GarageType'] = 'None'
    df.loc[df.BsmtCond.isna(), 'BsmtCond'] = 'None'
    df.loc[df.BsmtExposure.isna(), 'BsmtExposure'] = 'None'
    df.loc[df.BsmtQual.isna(), 'BsmtQual'] = 'None'
    df.loc[df.BsmtFinType1.isna(), 'BsmtFinType1'] = 'None'
    df.loc[df.MSZoning.isna(), 'MSZoning'] = 'Other'
    df.loc[df.Functional.isna(), 'Functional'] = 'Other'
    df.loc[df.SaleType.isna(), 'SaleType'] = 'Other'
    df.loc[df.KitchenQual.isna(), 'KitchenQual'] = df.groupby(
        'KitchenQual').KitchenQual.count().sort_values(
            ascending=False).index[1]
    df.loc[df.GarageCars.isna(), 'GarageCars'] = 0
    df.loc[df.Exterior1st.isna(), 'Exterior1st'] = 'Other'
    df.loc[df.Exterior2nd.isna(), 'Exterior2nd'] = 'Other'
    df.loc[df.MasVnrType.isna(), 'MasVnrType'] = 'None'

    # Apply Yeo-Johnson transformation to all numeric variables
    for i in df.columns:
        if df[i].dtype.name != 'object' and df[i].name != 'SalePrice':
            df[i] = power_transform(df[i].values.reshape(-1, 1),
                                    method='yeo-johnson')

    # One hot encode categoricals
    df = pd.get_dummies(df)

    table = {
        'Condition':
        ['Norm', 'Feedr', 'Near positive feature', 'Artery', 'Near railroad'],
        'Exterior': [
            'VinylSd', 'MetalSd', 'Wd Sdng', 'HdBoard', 'BrkFace', 'WdShing',
            'CemntBd', 'Plywood', 'AsbShng', 'Stucco', 'Other'
        ]
    }

    # Combine Exterior1st and Exterior2nd features
    # Combine Condition1 and Condition2 features
    def transformCols(row):
        for name in table['Condition']:
            row['Condition' + '_' + name] = max(row['Condition1_' + name],
                                                row['Condition2_' + name])

        for name in table['Exterior']:
            row['Exterior' + '_' + name] = max(row['Exterior1st_' + name],
                                               row['Exterior2nd_' + name])

        return row

    df = df.transform(transformCols, axis=1)

    for name in table['Condition']:
        df.drop(['Condition1_' + name, 'Condition2_' + name],
                axis=1,
                inplace=True)

    for name in table['Exterior']:
        df.drop(['Exterior1st_' + name, 'Exterior2nd_' + name],
                axis=1,
                inplace=True)

    df.SalePrice = np.log(df.SalePrice)

    return [df[df.Type_train == 1], df[df.Type_test == 1]]
Example #29
    def suggest(self, n_suggestions=1):
        if self.X.shape[0] < 4 * n_suggestions:
            df_suggest = self.quasi_sample(n_suggestions)
            x_guess = []
            for i, row in df_suggest.iterrows():
                x_guess.append(row.to_dict())
        else:
            X, Xe = self.space.transform(self.X)
            try:
                if self.y.min() <= 0:
                    y = torch.FloatTensor(
                        power_transform(self.y / self.y.std(),
                                        method='yeo-johnson'))
                else:
                    y = torch.FloatTensor(
                        power_transform(self.y / self.y.std(),
                                        method='box-cox'))
                    if y.std() < 0.5:
                        y = torch.FloatTensor(
                            power_transform(self.y / self.y.std(),
                                            method='yeo-johnson'))
                if y.std() < 0.5:
                    raise RuntimeError('Power transformation failed')
                model = get_model(self.model_name, self.space.num_numeric,
                                  self.space.num_categorical, 1,
                                  **self.model_config)
                model.fit(X, Xe, y)
            except:
                print('Error fitting GP')
                y = torch.FloatTensor(self.y).clone()
                filt, q = self.filter(y)
                print('Q = %g, kept = %d/%d' %
                      (q, y.shape[0], self.y.shape[0]))
                X = X[filt]
                Xe = Xe[filt]
                y = y[filt]
                model = get_model(self.model_name, self.space.num_numeric,
                                  self.space.num_categorical, 1,
                                  **self.model_config)
                model.fit(X, Xe, y)
            print('Noise level: %g' % model.noise, flush=True)

            best_id = np.argmin(self.y.squeeze())
            best_x = self.X.iloc[[best_id]]
            best_y = y.min()
            py_best, ps2_best = model.predict(*self.space.transform(best_x))
            py_best = py_best.detach().numpy().squeeze()
            ps_best = ps2_best.sqrt().detach().numpy().squeeze()

            # XXX: minimize (mu, -1 * sigma)
            #      s.t.     LCB < best_y
            iter = max(1, self.X.shape[0] // n_suggestions)
            upsi = 0.5
            delta = 0.01
            kappa = np.sqrt(
                upsi * 2 *
                np.log(iter**(2.0 + self.X.shape[1] / 2.0) * 3 * np.pi**2 /
                       (3 * delta)))

            acq = MACE(model, py_best, kappa=kappa)  # LCB < py_best
            mu = Mean(model)
            sig = Sigma(model, linear_a=-1.)
            opt = EvolutionOpt(self.space,
                               acq,
                               pop=100,
                               iters=100,
                               verbose=True)
            rec = opt.optimize(initial_suggest=best_x).drop_duplicates()
            rec = rec[self.check_unique(rec)]

            cnt = 0
            while rec.shape[0] < n_suggestions:
                rand_rec = self.quasi_sample(n_suggestions - rec.shape[0])
                rand_rec = rand_rec[self.check_unique(rand_rec)]
                rec = rec.append(rand_rec, ignore_index=True)
                cnt += 1
                if cnt > 3:
                    break
            if rec.shape[0] < n_suggestions:
                rand_rec = self.quasi_sample(n_suggestions - rec.shape[0])
                rec = rec.append(rand_rec, ignore_index=True)

            select_id = np.random.choice(rec.shape[0],
                                         n_suggestions,
                                         replace=False).tolist()
            x_guess = []
            with torch.no_grad():
                py_all = mu(*self.space.transform(rec)).squeeze().numpy()
                ps_all = -1 * sig(*self.space.transform(rec)).squeeze().numpy()
                best_pred_id = np.argmin(py_all)
                best_unce_id = np.argmax(ps_all)
                if best_unce_id not in select_id and n_suggestions > 2:
                    select_id[0] = best_unce_id
                if best_pred_id not in select_id and n_suggestions > 2:
                    select_id[1] = best_pred_id
                rec_selected = rec.iloc[select_id].copy()
                py, ps2 = model.predict(*self.space.transform(rec_selected))
                rec_selected['py'] = py.squeeze().numpy()
                rec_selected['ps'] = ps2.sqrt().squeeze().numpy()
                print(rec_selected)
            print('Best y is %g %g %g %g' %
                  (self.y.min(), best_y, py_best, ps_best),
                  flush=True)
            for idx in select_id:
                x_guess.append(rec.iloc[idx].to_dict())

        for rec in x_guess:
            for name in rec:
                if self.api_config[name]['type'] == 'int':
                    rec[name] = int(rec[name])
        return x_guess
Example #30
# In[23]:

iplot(
    dict(data=[
        dict(type='violin', name=name, y=data, box=dict(visible=True))
        for name, data in zip(standardized_feature_names, (
            standardized_features[:, j] for j in count()))
    ],
         layout=dict(title="Standardized Population Distribution by Feature")))

# ### Power Transform Features

# In[24]:

power_transformed_features = power_transform(raw_features, standardize=True)
power_transformed_feature_names = [
    name.partition(' (cm)')[0] for name in feature_names
]

# In[25]:

iplot(
    dict(
        data=[
            dict(type='violin', name=name, y=data, box=dict(visible=True))
            for name, data in zip(power_transformed_feature_names, (
                power_transformed_features[:, j] for j in count()))
        ],
        layout=dict(
            title=