Example No. 1
    def test_basic(self, output_distribution):
        rs = da.random.RandomState(0)
        a = dpp.QuantileTransformer(output_distribution=output_distribution)
        b = spp.QuantileTransformer(output_distribution=output_distribution)

        X = rs.uniform(size=(1000, 3), chunks=50)
        a.fit(X)
        b.fit(X)
        assert_estimator_equal(a, b, atol=0.02)

        # set the quantiles, so that from here out, we're exact
        a.quantiles_ = b.quantiles_
        assert_eq_ar(a.transform(X), b.transform(X), atol=1e-7)
        assert_eq_ar(X, a.inverse_transform(a.transform(X)))
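
The loose atol=0.02 above is needed presumably because the dask-ml estimator computes its quantiles approximately from chunked data, so its quantiles_ differ slightly from scikit-learn's; once they are copied over, the mapping matches exactly. A minimal plain-scikit-learn sketch of the same round-trip idea, assuming only numpy and scikit-learn:

import numpy as np
from sklearn.preprocessing import QuantileTransformer

rng = np.random.RandomState(0)
X = rng.uniform(size=(1000, 3))

qt = QuantileTransformer(output_distribution='uniform')
Xt = qt.fit_transform(X)             # each column mapped to roughly U(0, 1)
X_back = qt.inverse_transform(Xt)    # round trip recovers the original values

assert np.allclose(X, X_back, atol=1e-7)
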
Example No. 2
    def scaler(self):
        df = self.scaler_df

        scaled_df = pd.DataFrame()
        corr_df = pd.DataFrame()
        scaler_dict = {}

        for i, column in enumerate(df):

            # print(i, column)
            pd_series = df[column]

            gaussian_scaler = preprocessing.QuantileTransformer(
                output_distribution='normal')
            x = pd_series.values.reshape(-1, 1)  # returns a numpy array
            x_scaled = gaussian_scaler.fit_transform(x)
            x_scaled = pd.Series(x_scaled[:, 0])

            # check for values correlation with original values at decile 10 and 90
            thres_list = [0.05, 0.1]
            x_thres = self.quantile_check(x, thres_list)

            df2 = pd.concat(
                [x_scaled, pd_series.reset_index(drop=True)],
                axis=1,
                ignore_index=True)
            df2.columns = ['transform', 'ori']

            for tuples in x_thres:
                df3 = df2.loc[df2['ori'].between(tuples[0], tuples[1])]
                corr = df3.corr()
                corr_value = corr.iloc[0, 1]
                temp_df = pd.DataFrame([column, tuples[0], corr_value]).T
                corr_df = pd.concat([corr_df, temp_df], ignore_index=True)

            scaled_df = pd.concat((scaled_df, x_scaled),
                                  axis=1,
                                  ignore_index=True)
            scaler_filename = '/home/hchong/Documents/kaggle/plasticc/scaler/{column}.save'.format(
                column=column)
            joblib.dump(gaussian_scaler, scaler_filename)
            scaler_dict[column] = gaussian_scaler

        scaled_df.columns = [x for x in df.columns]
        corr_df.columns = ['columns', 'threshold', 'correlation']

        self.scaled_df = scaled_df
        self.scaler_dict = scaler_dict
        self.scaled_corr_df = corr_df
def normalize_data(X, feature_dict, using_features, dont_show=True):
    """
    Inputs:
     -  X: numpy array of shape(num of patients x num of features)
            eg: 132 x 16.
     -  feature_index: list of ints, corresponding to the txt files.
            eg: [0, 3, 41].
    Outputs:
     -  A normalized X, of same shape.
    Function:
     -  Firstly, Normalize X by numpy.preprocessing.
     -  Secondly, If -9999 is in a feature, we normalize this feature again, but without the -9999 ones.
        Thirdly, we paste the normalized feature back to X.
            e.g: - X = [1, 2, -9999]
                 - Firstly, normalized to [0.01, 0.02, -1.5]. 
                 - Secondly, [1, 2] is normalized to [0.5, 0.6].
                 - Thirdly, paste back, we get [0.5, 0.6, -1.5].
        This could keep the shape of X, makes indexing easier and doesn't harm the performance of kNN.
     """
    features = using_features
    num_patients, num_features = X.shape
    full_data_patients = np.arange(num_patients)
    miss_data_patients = []
    miss_data_features = []
    
    for i in range(num_patients):
        for j in range(num_features):
            if X[i][j] < -9998:
                miss_data_patients.append(i)
                if j not in miss_data_features:
                    miss_data_features.append(j)
                break
    full_data_patients = np.delete(full_data_patients, miss_data_patients)
    if not dont_show:
        print("\nMissing data features:")
        for j in miss_data_features:
            print(feature_dict[features[j]])
        print("\nFull data patients:")
        print(full_data_patients)
    #X_normalized = preprocessing.normalize(X, norm='l2', axis=0)
    scaler = preprocessing.QuantileTransformer(output_distribution='uniform')
    X_normalized = scaler.fit_transform(X)
    if miss_data_features:
        for j in miss_data_features:
            # Re-normalize each affected feature using only the patients with full data,
            # then paste it back so the -9999 rows keep their first-pass values.
            X_miss_data_feature = X[full_data_patients, j].reshape(-1, 1)
            X_miss_data_feature_normalized = preprocessing.normalize(X_miss_data_feature, norm='l2', axis=0).reshape(-1)
            X_normalized[full_data_patients, j] = X_miss_data_feature_normalized
    return X_normalized
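
A self-contained sketch of the sentinel-handling idea described in the docstring, using QuantileTransformer for both passes instead of the L2 normalization (the tiny array and n_quantiles values are illustrative only):

import numpy as np
from sklearn.preprocessing import QuantileTransformer

X = np.array([[1.0], [2.0], [-9999.0]])   # one feature; -9999 marks missing data

# First pass: transform everything, sentinel included.
X_norm = QuantileTransformer(n_quantiles=3).fit_transform(X)

# Second pass: re-transform only the non-missing rows and paste them back,
# so the array keeps its shape and the sentinel row keeps its first-pass value.
mask = X[:, 0] > -9998
X_norm[mask, 0] = QuantileTransformer(n_quantiles=2).fit_transform(X[mask])[:, 0]
print(X_norm)
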
Example No. 4
    def impute_method_setup(
            self,
            random_state=DEFAULT_IMPUTER_RANDOM_STATE,
            add_indicator=DEFAULT_IMPUTER_ADD_INDICATOR,
            initial_strategy=DEFAULT_IMPUTER_INITIAL_STRATEGY,
            max_iter=DEFAULT_IMPUTER_MAX_ITER,
            estimator=DEFAULT_IMPUTER_ESTIMATOR,
            output_distribution=DEFAULT_TRANSFORMER_OUTPUT_DISTRIBUTION,
            transformer_method=DEFAULT_TRANSFORMER_METHOD,
            transformer_standardize=DEFAULT_TRANSFORMER_STANDARDIZE):
        """ Initialises the IterativeImputer, QuantileTransformer and PowerTransformer methods required 
            if missing data is to be imputed.
            
            Parameters are passed to the sklearn routines. Where this is being done it is noted below. 
            For further documentation on how these functions work, and what the parameters denote, 
            please refer to the sklearn documentation.

            IterativeImputer:    https://scikit-learn.org/stable/modules/generated/sklearn.impute.IterativeImputer.html
            QuantileTransformer: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.QuantileTransformer.html
            PowerTransformer:    https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PowerTransformer.html
            
            Args:
                random_state:           (int) (IterativeImputer & QuantileTransformer) seed for pseudo random number generator
                add_indicator:          (boolean) (IterativeImputer) if True adds a `MissingIndicator` transform to the stack
                initial_strategy:       (str) (IterativeImputer) define strategy to use for initialising missing values
                max_iter:               (int) (IterativeImputer) maximum number of imputation rounds to perform
                estimator:              (str) (IterativeImputer) estimator method to be used
                output_distribution:    (str) (QuantileTransformer) Marginal distribution for the transformed data
                transformer_method:     (str) (PowerTransformer) method to use, 'box-cox' is default
                transformer_standardize: (boolean) (PowerTransformer) select if zero-mean, unit-variance normalisation is applied, default is True

             Returns: None
        """

        # set the imputer options (if we are using them)
        self.imputer = IterativeImputer(random_state=random_state,
                                        add_indicator=add_indicator,
                                        initial_strategy=initial_strategy,
                                        max_iter=max_iter,
                                        verbose=self.verbose,
                                        estimator=estimator)

        # set the quantile transform options
        self.transformer_quantile = preprocessing.QuantileTransformer(
            output_distribution=output_distribution, random_state=random_state)

        # set the power transform options
        self.transformer_power = preprocessing.PowerTransformer(
            method=transformer_method, standardize=transformer_standardize)
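
The DEFAULT_* constants are defined elsewhere in the original module; the values below are hypothetical stand-ins, shown only to illustrate the kind of configuration the method expects:

from sklearn.linear_model import BayesianRidge

# Hypothetical defaults; the real project may use different values.
DEFAULT_IMPUTER_RANDOM_STATE = 0
DEFAULT_IMPUTER_ADD_INDICATOR = False
DEFAULT_IMPUTER_INITIAL_STRATEGY = 'mean'
DEFAULT_IMPUTER_MAX_ITER = 10
DEFAULT_IMPUTER_ESTIMATOR = BayesianRidge()        # hypothetical: IterativeImputer's own default estimator
DEFAULT_TRANSFORMER_OUTPUT_DISTRIBUTION = 'normal'
DEFAULT_TRANSFORMER_METHOD = 'box-cox'             # per the docstring above
DEFAULT_TRANSFORMER_STANDARDIZE = True             # per the docstring above
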
Example No. 5
def feature_scale(feature_scaler, X_train, y_train):

    # more information about these scalers can be found @
    # https://scikit-learn.org/stable/modules/preprocessing.html

    if feature_scaler == 'binarizer':
        # scale the X values in the set
        model = preprocessing.Binarizer()

    elif feature_scaler == 'one_hot_encoder':
        '''
        >>> enc.transform([['female', 'from US', 'uses Safari'],
        ...                ['male', 'from Europe', 'uses Safari']]).toarray()
        array([[1., 0., 0., 1., 0., 1.],
               [0., 1., 1., 0., 0., 1.]])
        '''
        # This is on y values
        model = preprocessing.OneHotEncoder(handle_unknown='ignore')

    elif feature_scaler == 'maxabs':
        model = preprocessing.MaxAbsScaler()

    elif feature_scaler == 'minmax':
        model = preprocessing.MinMaxScaler()

    elif feature_scaler == 'normalize':
        # L2 normalization
        model = preprocessing.Normalizer()

    elif feature_scaler == 'poly':
        # scale the X values in the set
        model = PolynomialFeatures(2)

    elif feature_scaler == 'power_transformer':
        # scale the X values in the set
        model = preprocessing.PowerTransformer(method='yeo-johnson')

    elif feature_scaler == 'quantile_transformer_normal':
        # scale the X values in the set
        model = preprocessing.QuantileTransformer(output_distribution='normal')

    elif feature_scaler == 'robust':
        model = preprocessing.RobustScaler(quantile_range=(25, 75))

    elif feature_scaler == 'standard_scaler':
        # scale the X values in the set
        model = preprocessing.StandardScaler()

    return model
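
feature_scale only selects and returns an unfitted transformer, so the caller is responsible for fitting it; a brief usage sketch on toy data:

import numpy as np

X_train = np.random.rand(1000, 4)
y_train = np.random.randint(0, 2, size=1000)

model = feature_scale('quantile_transformer_normal', X_train, y_train)
X_scaled = model.fit_transform(X_train)   # fit on the training split only
print(X_scaled.mean(axis=0), X_scaled.std(axis=0))
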
def train_validate(df):

    print("[INFO] preparing X_train / y_train...")

    df = df.sample(frac=0.7, replace=True)

    id = pd.DataFrame(data = df, columns=["id", "molecule_name"])

    y = pd.DataFrame(data = df, columns=["scalar_coupling_constant"])

    # Split the 'features' and 'income' data into training and testing sets
    X_train, X_val, y_train, y_val = train_test_split(df.drop(['id', 'molecule_name', 'scalar_coupling_constant'], axis=1),
                                                          y,
                                                          test_size = 0.20)

    normalization = input("Which type of normalization do you want? (standardScaler, minMax, quantile, normal with l1, normal with l2)...   ")

    print("[INFO] Preparing normalization...")

    if normalization == "standardScaler":
        scaler = preprocessing.StandardScaler(copy=True, with_mean=True, with_std=True).fit(X_train)
        X_train = scaler.transform(X_train)
    elif normalization == "minMax":
        min_max_scaler = preprocessing.MinMaxScaler()
        X_train = min_max_scaler.fit_transform(X_train)
    elif normalization == "quantile":
        quantile_transformer = preprocessing.QuantileTransformer(random_state=0)
        X_train = quantile_transformer.fit_transform(X_train)
    elif normalization == "normal with l1":
        norm = 'l1'
        X_train = preprocessing.normalize(X_train, norm=norm)
    else:
        norm = 'l2'
        X_train = preprocessing.normalize(X_train, norm=norm)


    print("Datasets: Prepared")
    print("Training sets have shape {} and {}.".format(X_train.shape, y_train.shape))
    print("Validation sets have shape {} and {}.".format(X_val.shape, y_val.shape))

    print("[INFO] saving data...")

    np.save(os.path.join(DATA_DIR,'X_train.npy'), X_train)
    np.save(os.path.join(DATA_DIR,'X_val.npy'), X_val)
    np.save(os.path.join(DATA_DIR,'y_train.npy'), y_train)
    np.save(os.path.join(DATA_DIR,'y_val.npy'), y_val)

    print("[INFO] data saved as numpy arrays...")

    print("[INFO] completed...")
Example No. 7
def soft_voting_1(df_res, y):
    print('\n')
    print('SOFT VOTING')

    #    min_max_scaler = preprocessing.MinMaxScaler()
    #    df_res = min_max_scaler.fit_transform(df_res)

    #    robust_scaler = preprocessing.RobustScaler()
    #    df_res = robust_scaler.fit_transform(df_res)

    quantile = preprocessing.QuantileTransformer()
    df_res = quantile.fit_transform(df_res)

    clf1 = ensemble.AdaBoostClassifier()
    clf2 = MLPClassifier(
    )  #AdaBoostClassifier()#ensemble.RandomForestClassifier(n_estimators=200, random_state=11,n_jobs=-1)
    clf3 = ensemble.GradientBoostingClassifier(
    )  #ensemble.GradientBoostingClassifier(n_estimators=3000, learning_rate=1.1, max_depth=5, random_state=11)
    clf4 = SGDClassifier(
        loss='log', max_iter=1000
    )  #SGDClassifier(max_iter=35000, tol=1e-4, shuffle=True, penalty='l2', loss='log')
    clf5 = LogisticRegression()
    clf6 = LogisticRegressionCV()
    clf7 = QuadraticDiscriminantAnalysis()
    clf8 = GaussianNB()
    clf9 = KNeighborsClassifier(3)
    clf10 = SVC(probability=True)
    eclf = VotingClassifier(estimators=[('ada', clf1), ('mlpc', clf2),
                                        ('gbs', clf3), ('sgdc', clf4),
                                        ('lgr', clf5), ('lrcv', clf6),
                                        ('qda', clf7), ('gnb', clf8),
                                        ('knn', clf9), ('cvc', clf10)],
                            voting='soft',
                            weights=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

    for clf, label in zip(
        [clf1, clf2, clf3, clf4, clf5, clf6, clf7, clf8, clf9, clf10, eclf], [
            'AdaBoostClassifier', 'MLPClassifier', 'GradientBoosting',
            'SGDClassifier', 'LogisticRegression', 'LogisticRegressionCV',
            'QuadraticDiscriminantAnalysis', 'GaussianNB',
            'KNeighborsClassifier', 'SVC', 'Ensemble'
        ]):
        scores = cross_val_score(clf, df_res, y, cv=5, scoring='roc_auc')
        print("ROC_AUC scoring: %0.5f (+/- %0.5f) [%s]" %
              (scores.mean(), scores.std(), label))
    return eclf
Example No. 8
    def norm_data(self, data, algorithm='norm'):
        # Normalize the data; dt must be 2-D.
        dt = np.array(list(data))
        shape = np.shape(dt)
        # Algorithms prefixed with 'u-' (and 'decimal') are computed with the custom formulas below.
        _custom = algorithm.startswith('u-') or algorithm == 'decimal'

        if not _custom and len(shape) <= 1:
            raise ValueError('query 2d data')

        if algorithm == 'norm':
            scaler = preprocessing.Normalizer()
        elif algorithm == 'max-min':
            # Min-max scaling.
            scaler = preprocessing.MinMaxScaler()
        elif algorithm == 'qt':
            scaler = preprocessing.QuantileTransformer()
        elif algorithm == 'max':
            # Maximum-absolute-value scaling.
            scaler = preprocessing.MaxAbsScaler()
        elif algorithm == 'stand':
            # Subtract the mean, divide by the standard deviation.
            scaler = preprocessing.StandardScaler()
        elif algorithm == 'u-max':
            # Custom max scaling to [-1, 1], using the largest absolute value in dt.
            _max = np.max(np.abs(dt))
            _res = dt / _max
        elif algorithm == 'u-max-min':
            # Custom min-max scaling to [0, 1].
            _max = np.max(dt)
            _min = np.min(dt)
            _res = (dt - _min) / (_max - _min)
        elif algorithm == 'u-stand':
            # Custom centring and standardization, suited to stable data that varies little.
            _mean = np.mean(dt)
            _var = np.std(dt)
            _res = (dt - _mean) / _var
        elif algorithm == 'decimal':
            # Custom decimal scaling to [0, 1].
            _q = np.log10(dt.max())
            _res = dt / np.power(10, _q)

        if _custom:
            return _res
        else:
            take_data = scaler.fit_transform(dt)
            return take_data
Example No. 9
def scaleFeatures(data, opt='standard', **kwargs):
    from sklearn import preprocessing
    if opt == 'standard':
        scl = preprocessing.StandardScaler(**kwargs)
    elif opt == 'robust':
        scl = preprocessing.RobustScaler(**kwargs)
    elif opt == 'minmax':
        scl = preprocessing.MinMaxScaler(**kwargs)
    elif opt == 'norm':
        scl = preprocessing.Normalizer(**kwargs)
    elif opt == 'gaussian':  # doesn't work! no idea why
        scl = preprocessing.PowerTransformer(method='yeo-johnson')
    elif opt == 'quantile':
        scl = preprocessing.QuantileTransformer(output_distribution='normal')
    out = pd.DataFrame(scl.fit_transform(data), columns=data.columns)
    print("Features scaled using", opt, "scaling method!")
    return out
Example No. 10
def tuned_gradboost(loadWeights):
    pipe = Pipeline([
        ('std', preprocessing.QuantileTransformer()),
        ('gbc', GradientBoostingClassifier())  # ExtraTreesClassifier())
    ])
    param_grid = [{  # 'gbc__criterion' : ["gini"],#gini is good
        'gbc__n_estimators': [100, 200, 250],  # 1000
        'gbc__learning_rate': [0.1, 0.05, 0.01],
        #'gbc__max_depth': [2, 4, 8, 16],
        'gbc__min_samples_leaf': [1, 10],  # 100 and 200 is bad
        'gbc__min_samples_split': [10, 100, 400],
        'gbc__max_features': ["auto", 10, 7, 1]  # 0.5,0.1
    }]

    #gsGBC = GridSearchCV(pipe,param_grid = param_grid, cv=3, scoring="accuracy", n_jobs= -1, verbose = 2)

    #gsGBC.fit(dataSet.X_train, dataSet.y_train)

    model_tuner = None
    if not loadWeights or not os.path.exists('weights/' +
                                             sys._getframe().f_code.co_name +
                                             '.pkl'):
        loadWeights = False
        model_tuner = GridSearchCV(
            pipe,
            param_grid,
            cv=2,
            n_jobs=-1,
            verbose=2,
            return_train_score=True)  #cv=dataSet.ten_fold_cv
        model_tuner.fit(dataSet.X_train, dataSet.y_train)
    else:
        model_tuner = joblib.load('weights/' + sys._getframe().f_code.co_name +
                                  '.pkl')

    report_summary(model_tuner)

    if not loadWeights:
        joblib.dump(model_tuner,
                    'weights/' + sys._getframe().f_code.co_name + '.pkl',
                    compress=1)

    results = model_tuner.cv_results_

    return 0
Example No. 11
    def get_gaussian_data(self, x_train, x_test):
        """
        Cette méthode va venir modifier nos données brute grâce à la méthode 
        Gaussian de préprocessing de scikit learn
        
        Args:
            x_train (np.array): donn�es d'entrainement devant �tre transform�es
            x_test (np.array): donn�es de test devant �tre transform�es
            
        Returns:
            x_train_gauss (np.array): donn�es d'entrainement transform�es
            x_test_gauss (np.array): donn�es de test transform�es
        """
        quantile_transformer2 = preprocessing.QuantileTransformer(output_distribution='normal', random_state=17)
        x_train_gauss = quantile_transformer2.fit_transform(x_train)
        x_test_gauss = quantile_transformer2.fit_transform(x_test)

        return x_train_gauss, x_test_gauss
Example No. 12
    def get_quantile_data(self, x_train, x_test):
        """
        Cette méthode va venir modifier nos données brute grâce à la méthode 
        quantile de préprocessing de scikit learn
        
        Args:
            x_train (np.array): donn�es d'entrainement devant �tre transform�es
            x_test (np.array): donn�es de test devant �tre transform�es
            
        Returns:
            x_train_trans (np.array): donn�es d'entrainement transform�es
            x_test_trans (np.array): donn�es de test transform�es
        """
        quantile_transformer = preprocessing.QuantileTransformer(random_state=17)
        x_train_trans = quantile_transformer.fit_transform(x_train)
        x_test_trans = quantile_transformer.fit_transform(x_test)

        return x_train_trans, x_test_trans
Example No. 13
def train_drfs(train_x, train_y, eps=0.5, threshold="median"):
    n_samples, n_features, n_classes = \
            get_counts_tt(train_x, train_y)

    # pick number of components
    min_comp = random_projection.johnson_lindenstrauss_min_dim( \
            n_samples=n_samples, eps=eps)
    min_comp = min(min_comp, n_features)

    # scale and agglomerate to min_comp
    #scaler = preprocessing.StandardScaler()
    scaler = preprocessing.QuantileTransformer()
    feat_agg = cluster.FeatureAgglomeration( \
            n_clusters=min_comp)
    xtc = ensemble.ExtraTreesClassifier(n_estimators=100, n_jobs=-1)
    scaler2 = preprocessing.RobustScaler()
    #poly = preprocessing.PolynomialFeatures(degree=2, interaction_only=True)

    # train the model pipeline
    dr_pipe = pipeline.Pipeline([('scaler', scaler), \
            ('feat_agg', feat_agg), ('scaler2', scaler2)])

    dr_pipe.fit(train_x)

    # transform train_x to train xtc
    train_x = dr_pipe.transform(train_x)
    # train the xtc
    xtc.fit(train_x, train_y)

    print("Feature importances:")
    print("\tMax:", max(xtc.feature_importances_))
    print("\tMin:", min(xtc.feature_importances_))
    #print(xtc.feature_importances_)

    # create the feature selection model from the xtc
    feat_sel = feature_selection.SelectFromModel( \
            xtc, prefit=True, threshold=threshold)

    # create the pipeline to reduce dim then feature select
    drfs_pipe = pipeline.Pipeline(\
            [('dr_pipe', dr_pipe), ('feat_sel', feat_sel)])

    return drfs_pipe
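
A hedged sketch of how the returned pipeline might be used downstream (train_x and train_y are assumed to be a numeric feature matrix and label vector; get_counts_tt is a project helper not shown here):

drfs_pipe = train_drfs(train_x, train_y, eps=0.5, threshold="median")

# Apply the fitted scale -> agglomerate -> rescale -> select pipeline to any split.
train_x_reduced = drfs_pipe.transform(train_x)
print("reduced shape:", train_x_reduced.shape)
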
    def scaler(self, method='yeo-johnson'):
        '''
        Scale the data towards a Gaussian distribution N(0, 1).

        Parameters
        ----------
        method : string, optional
            Method to use for the scaling transformation. The default is 'yeo-johnson'.

        Returns
        -------
        dataframe : DataFrame
            The scaled data.
        scaler : object
            Scaler fitted on the data.

        '''

        if method == 'standard': scaler = preprocessing.StandardScaler()
        if method == 'minmax': scaler = preprocessing.MinMaxScaler()
        if method == 'maxabs': scaler = preprocessing.MaxAbsScaler()
        if method == 'robust': scaler = preprocessing.RobustScaler()
        if method == 'quantile':
            scaler = preprocessing.QuantileTransformer(
                output_distribution='normal')

        if method == 'l1': scaler = preprocessing.Normalizer(norm='l1')
        if method == 'l2': scaler = preprocessing.Normalizer(norm='l2')
        if method == 'max': scaler = preprocessing.Normalizer(norm='max')

        feature_sign = self._check_sign_feature()
        if method == 'box-cox' or feature_sign == 'positive':
            scaler = preprocessing.PowerTransformer(method)
        if method == 'yeo-johnson' or feature_sign == 'negative':
            scaler = preprocessing.PowerTransformer(method)

        scaler.fit(self.dataframe)
        self.dataframe = scaler.transform(self.dataframe)

        return self.dataframe, scaler
Example No. 15
def fit_scaler(sample,
               n_dims,
               scaler_out,
               reshape=True,
               scaler_type='QuantileTransformer'):
    print('Fitting quantile transform', end=' ', flush=True)
    start_time = time.time()
    if reshape: sample = np.reshape(sample, (-1, n_dims))
    if scaler_type == 'QuantileTransformer':
        scaler = preprocessing.QuantileTransformer(
            output_distribution='uniform', n_quantiles=10000, random_state=0)
    if scaler_type == 'MaxAbsScaler':
        scaler = preprocessing.MaxAbsScaler()
    if scaler_type == 'RobustScaler':
        scaler = preprocessing.RobustScaler()
    scaler.fit(sample)
    print('(', '\b' + format(time.time() - start_time, '2.1f'), '\b' + ' s)')
    print('Saving scaling to', scaler_out)
    pickle.dump(scaler, open(scaler_out, 'wb'))
    return scaler
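
Since the scaler is persisted with pickle, downstream code can reload it and reuse the fitted quantile mapping without refitting; a short sketch with illustrative path and width values:

import pickle
import numpy as np

scaler_out = 'quantile_scaler.pkl'   # illustrative: same path that was passed to fit_scaler
n_dims = 4                           # illustrative: width the scaler was fitted on

with open(scaler_out, 'rb') as f_in:
    scaler = pickle.load(f_in)

new_sample_scaled = scaler.transform(np.random.rand(32, n_dims))
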
Example No. 16
def hard_voting(df_res, y):

    print('HARD VOTING')

    quantile = preprocessing.QuantileTransformer()
    df_res = quantile.fit_transform(df_res)

    clf1 = ensemble.AdaBoostClassifier()
    clf2 = MLPClassifier(
    )  #AdaBoostClassifier()#ensemble.RandomForestClassifier(n_estimators=200, random_state=11,n_jobs=-1)
    clf3 = ensemble.GradientBoostingClassifier(
    )  #ensemble.GradientBoostingClassifier(n_estimators=3000, learning_rate=1.1, max_depth=5, random_state=11)
    clf4 = SGDClassifier(
        loss='log', max_iter=1000
    )  #SGDClassifier(max_iter=35000, tol=1e-4, shuffle=True, penalty='l2', loss='log')
    clf5 = LogisticRegression()
    clf6 = LogisticRegressionCV()
    clf7 = QuadraticDiscriminantAnalysis()
    clf8 = GaussianNB()
    clf9 = LinearDiscriminantAnalysis()
    eclf = VotingClassifier(estimators=[('ada', clf1), ('mlpc', clf2),
                                        ('gbs', clf3), ('sgdc', clf4),
                                        ('lgr', clf5), ('lrcv', clf6),
                                        ('qda', clf7), ('gnb', clf8),
                                        ('lda', clf9)],
                            voting='hard')

    #    eclf = VotingClassifier(estimators=[('ada', clf1), ('mlpc', clf2),
    #                                        ('gbs', clf3), ('sgdc', clf4), ('lgr', clf5), ('lrcv', clf6)], voting='hard')

    for clf, label in zip(
        [clf1, clf2, clf3, clf4, clf5, clf6, clf7, clf8, clf9, eclf], [
            'AdaBoostClassifier', 'MLPClassifier', 'GradientBoosting',
            'SGDClassifier', 'LogisticRegression', 'LogisticRegressionCV',
            'QuadraticDiscriminantAnalysis', 'GaussianNB',
            'LinearDiscriminantAnalysis', 'Ensemble'
        ]):
        scores = cross_val_score(clf, df_res, y, cv=5,
                                 scoring='roc_auc')  #, scoring='roc_auc'
        print("ROC_AUC scoring: %0.5f (+/- %0.5f) [%s]" %
              (scores.mean(), scores.std(), label))
Example No. 17
def transformation(X, method=1, powerMet='yeo-johnson'):
    """Power transformation

    Args:
        X (float): Input data.
        method (int, optional): 0: QuantileTransformer, 1: PowerTransformer. Defaults to 1.
        powerMet (str, optional): Power-transform method; only used when method is 1. Defaults to 'yeo-johnson'.

    Returns:
        [float]: X_tr, the transformed version of X.
        [transformer]: scaler, the transformer fitted on the input X (None when no transform was applied).
    """
    if method == 0:
        scaler = preprocessing.QuantileTransformer(random_state=0)
        X_tr = scaler.fit_transform(X)
    elif method == 1:
        scaler = preprocessing.PowerTransformer(method=powerMet,
                                                standardize=False)
        X_tr = scaler.fit_transform(X)
    else:
        scaler = None
        X_tr = X
    return X_tr, scaler
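
The fitted transformer is returned alongside the data so the same mapping can be applied to held-out samples; a brief usage sketch on skewed toy data:

import numpy as np

X = np.random.exponential(size=(1000, 3))

X_q, qt_scaler = transformation(X, method=0)   # quantile transform to U(0, 1)
X_p, pw_scaler = transformation(X, method=1)   # yeo-johnson power transform

# Reuse the fitted quantile transformer on new data from the same source.
X_new_q = qt_scaler.transform(np.random.exponential(size=(10, 3)))
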
Example No. 18
    def applyScale(self):

        if self.optionScale == 0:
            pass            
        elif self.optionScale == 1:#quick scale
            self.dataSet = preprocessing.scale(self.dataSet)

        elif self.optionScale == 2:#standar scale
            scaler = preprocessing.StandardScaler().fit(self.dataSet)
            self.dataSet = scaler.transform(self.dataSet)

        elif self.optionScale == 3:#min max scaler
            min_max_scaler = preprocessing.MinMaxScaler()
            self.dataSet = min_max_scaler.fit_transform(self.dataSet)

        elif self.optionScale == 4:#quantile transformation
            quantile_transformer = preprocessing.QuantileTransformer(random_state=0)
            self.dataSet = quantile_transformer.fit_transform(self.dataSet)

        else:#powerTransformation
            pt = preprocessing.PowerTransformer(method='box-cox', standardize=False)
            self.dataSet = pt.fit_transform(self.dataSet)
Example No. 19
    def process_data(self, data):
        if type(self.features) == list:
            self.return_features = [x + "_normalized" for x in self.features]
            for feature in self.features:
                return_feature = feature + "_normalized"
                if feature == "peak_width":
                    peak_width_min = data[feature].min()
                    peak_width_max = data[feature].max()
                    data[return_feature] = (data[[feature]].values -
                                            peak_width_min) / (peak_width_max -
                                                               peak_width_min)
                elif "percentage" in feature:
                    data[return_feature] = data[[feature]].values
                elif ('motif' in feature) and (self.method == "knn"):
                    data[return_feature] = data[[feature]].values / (50**0.5)
                else:
                    data[return_feature] = preprocessing.QuantileTransformer(
                    ).fit_transform(data[[feature]].values)
        else:
            print("feature must be list type")
            q()
        return data
def soft_voting(df_res, y):

    #    min_max_scaler = preprocessing.MinMaxScaler()
    #    df_res = min_max_scaler.fit_transform(df_res)

    #    robust_scaler = preprocessing.RobustScaler()
    #    df_res = robust_scaler.fit_transform(df_res)

    quantile = preprocessing.QuantileTransformer()
    df_res = quantile.fit_transform(df_res)

    clf1 = ensemble.AdaBoostClassifier()
    clf2 = MLPClassifier(
    )  #AdaBoostClassifier()#ensemble.RandomForestClassifier(n_estimators=200, random_state=11,n_jobs=-1)
    clf3 = ensemble.GradientBoostingClassifier(
    )  #ensemble.GradientBoostingClassifier(n_estimators=3000, learning_rate=1.1, max_depth=5, random_state=11)
    clf4 = SGDClassifier(
        loss='log', max_iter=1000
    )  #SGDClassifier(max_iter=35000, tol=1e-4, shuffle=True, penalty='l2', loss='log')
    clf5 = LogisticRegression()
    clf6 = LogisticRegressionCV()
    eclf = VotingClassifier(estimators=[('ada', clf1), ('mlpc', clf2),
                                        ('gbs', clf3), ('sgdc', clf4),
                                        ('lgr', clf5), ('lrcv', clf6)],
                            voting='soft',
                            weights=[1, 1, 1, 1, 1, 1])

    #    eclf = VotingClassifier(estimators=[('ada', clf1), ('mlpc', clf2),
    #                                        ('gbs', clf3), ('sgdc', clf4), ('lgr', clf5), ('lrcv', clf6)], voting='hard')

    for clf, label in zip([clf1, clf2, clf3, clf4, clf5, clf6, eclf], [
            'AdaBoostClassifier', 'MLPClassifier', 'GradientBoosting',
            'SGDClassifier', 'LogisticRegression', 'LogisticRegressionCV',
            'Ensemble'
    ]):
        scores = cross_val_score(clf, df_res, y, cv=5)  #, scoring='roc_auc'
        print("CV scoring: %0.5f (+/- %0.5f) [%s]" %
              (scores.mean(), scores.std(), label))
Example No. 21
def preprocess(x_train: numpy.ndarray, y_train: numpy.ndarray, x_test: numpy.ndarray) \
        -> Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray, list]:
    """
    Preprocess the data:
        Symmetrize x_train and y_train.

        Scale x_train and x_test using Quantile Transformer.

        Apply PCA keeping all the information.

    :param x_train: the train features.
    :param y_train: the train labels.
    :param x_test: the test features.
    :return: preprocessed x_train, y_train, x_test and pca.components_ used.
    """
    logger.log('Preprocessing...')

    logger.log('\tSymmetrize training dataset...')
    x_train, y_train = helpers.preprocessing.symmetrize_dataset(
        x_train, y_train)
    logger.log('\t' + str(len(y_train)) + ' training data remained')

    logger.log('\tScaling data using Quantile Transformer with params:')
    scaler = preprocessing.QuantileTransformer(random_state=0)
    logger.log('\t' + str(scaler.get_params()))
    scaler.fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)

    logger.log('\tApplying Principal Component Analysis with params:')
    # Keep all the information.
    pca = PCA(whiten=True, random_state=0)
    logger.log('\t' + str(pca.get_params()))
    pca.fit(x_train)
    x_train = pca.transform(x_train)
    x_test = pca.transform(x_test)

    return x_train, y_train, x_test, pca.components_
Example No. 22
def normalize_data(train_data, test_data,validation_1,validation_2, type=None):
    scaler = None
    if type == 'standard':
        scaler = preprocessing.StandardScaler()
    elif type == 'min_max':
        scaler = preprocessing.MinMaxScaler()
    elif type == 'l1':
        scaler = preprocessing.Normalizer('l1')
    elif type == 'l2':
        scaler = preprocessing.Normalizer('l2')
    elif type == 'quantile_normal':
        scaler = preprocessing.QuantileTransformer(output_distribution='normal')

    if scaler is not None:
        scaler.fit(train_data)
        train_data = scaler.transform(train_data)
        test_data = scaler.transform(test_data)
        validation_1 = scaler.transform(validation_1)
        validation_2 = scaler.transform(validation_2)
    else:
        print_out(f,"Invalid scaling method - no scaling has been done")

    return train_data, test_data, validation_1, validation_2
Example No. 23
def standardization(X_train, X_test=[], mode='zscore', scaler=None):
    new_X_test = []
    if scaler is None:
        if mode == 'zscore':
            scaler = preprocessing.StandardScaler().fit(X_train)
            new_X_train = scaler.transform(X_train)
            if len(X_test) > 0:
                new_X_test = scaler.transform(X_test)
        elif mode == 'minmax':
            scaler = preprocessing.MinMaxScaler().fit(X_train)
            new_X_train = scaler.transform(X_train)
            if len(X_test) > 0:
                new_X_test = scaler.transform(X_test)
        elif mode == 'quantile':
            scaler = preprocessing.QuantileTransformer(
                output_distribution='normal').fit(X_train)
            new_X_train = scaler.transform(X_train)
            if len(X_test) > 0:
                new_X_test = scaler.transform(X_test)
        elif mode == 'normalize':
            new_X_train = preprocessing.normalize(X_train)
            if len(X_test) > 0:
                # There is no fitted scaler in this mode, so normalize the test split directly.
                new_X_test = preprocessing.normalize(X_test)
            scaler = None
        else:
            print("'mode = %s' is not defined" % (mode))
            new_X_train = X_train
            new_X_test = X_test
            scaler = None

    else:
        new_X_train = scaler.transform(X_train)
        if len(X_test) > 0:
            new_X_test = scaler.transform(X_test)
    new_X_train = new_X_train.astype(np.float32)

    return new_X_train, new_X_test, scaler
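
Because the fitted scaler is returned, it can be passed back in to give later batches exactly the same transform; a minimal sketch of that reuse pattern:

import numpy as np

X_train = np.random.randn(1000, 5)
X_test = np.random.randn(200, 5)

# Fit on the training split and transform both splits in one call.
X_train_q, X_test_q, qscaler = standardization(X_train, X_test, mode='quantile')

# Later: apply the already-fitted scaler to a new batch without refitting.
X_new_q, _, _ = standardization(np.random.randn(50, 5), scaler=qscaler)
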
Example No. 24
    def test_transform_ct_1(self):
        """
        Unit test for apply_preprocessing on ColumnTransformer with drop option and sklearn encoder.
        """
        y = pd.DataFrame(data=[0, 1], columns=['y'])

        train = pd.DataFrame({
            'num1': [0, 1],
            'num2': [0, 2],
            'other': ['A', 'B']
        })

        enc = ColumnTransformer(transformers=[
            ('power', skp.QuantileTransformer(n_quantiles=2), ['num1', 'num2'])
        ],
                                remainder='drop')
        enc.fit(train, y)

        train_preprocessed = pd.DataFrame(enc.transform(train))

        clf = cb.CatBoostClassifier(n_estimators=1).fit(train_preprocessed, y)

        test = pd.DataFrame({
            'num1': [0, 1, 1],
            'num2': [0, 2, 3],
            'other': ['A', 'B', 'C']
        })

        expected = pd.DataFrame(enc.transform(test))
        result = apply_preprocessing(test, clf, enc)
        assert result.shape == expected.shape
        assert all(column in clf.feature_names_ for column in result.columns)
        assert all(expected.index == result.index)
        assert all([
            str(type_result) == str(expected.dtypes[index])
            for index, type_result in enumerate(result.dtypes)
        ])
Example No. 25
def feature_selection(df_res, y, file_number):

    feature_names = df_res.columns

    #    min_max_scaler = preprocessing.MinMaxScaler()
    #    df_res = min_max_scaler.fit_transform(df_res)
    quantile = preprocessing.QuantileTransformer()
    df_res = quantile.fit_transform(df_res)

    X_res_train, X_res_test, y_res_train, y_res_test = train_test_split(
        df_res, y, test_size=0.25, random_state=11, shuffle=True)

    gbc = ensemble.GradientBoostingClassifier(
    )  #n_estimators=100, random_state=11, n_jobs=-1
    gbc.fit(X_res_train, y_res_train)
    err_train = np.mean(y_res_train != gbc.predict(X_res_train))
    err_test = np.mean(y_res_test != gbc.predict(X_res_test))
    print(err_train, err_test)
    scores = cross_val_score(gbc, df_res, y, cv=5, scoring='roc_auc')
    #    print(scores)
    print("ROC_AUC GradientBoostingClassifier: %0.2f (+/- %0.5f)" %
          (scores.mean(), scores.std()))

    #    feature_names = df_res.columns
    importances = gbc.feature_importances_
    indices = np.argsort(importances)[::-1]

    low_cost_features = list()
    with open('res_' + str(file_number) + '.txt', 'w') as f:
        with redirect_stdout(f):
            #            print("Feature importances:")
            for rank, idx in enumerate(indices):
                #                print("{:2d}. feature '{:5s}' ({:.12f})".format(rank + 1, feature_names[idx], importances[idx]))
                if importances[idx] == 0:
                    low_cost_features.append(feature_names[idx])
    print('Number of zero-importance features:', len(low_cost_features))
    return low_cost_features
Example No. 26
def QuantileTransformer(train_df, test_df, HP):
    qt_params = HP['QuantileTransformer']
    n_quantiles = qt_params['n_quantiles']
    output_distribution = qt_params['output_distribution']
    ignore_implicit_zeros = qt_params['ignore_implicit_zeros']
    subsample = qt_params['subsample']
    copy = qt_params['copy']

    train_x = train_df.iloc[:, :-1]
    train_y = train_df.iloc[:, -1:]
    test_x = test_df.iloc[:, :-1]
    test_y = test_df.iloc[:, -1:]

    transformer = preprocessing.QuantileTransformer(
        n_quantiles=n_quantiles,
        output_distribution=output_distribution,
        ignore_implicit_zeros=ignore_implicit_zeros,
        subsample=subsample,
        copy=copy)
    train_x_copy = train_x.copy()
    train_x_transformed = transformer.fit_transform(train_x_copy)
    test_x_copy = test_x.copy()
    test_x_transformed = transformer.transform(test_x_copy)  # TODO check here

    train_column_name = list(train_x_copy.columns)
    test_column_name = list(test_x_copy.columns)

    train_x_transformed_df = pd.DataFrame(train_x_transformed)
    train_x_transformed_df.columns = train_column_name
    train_df_transformed = train_x_transformed_df.assign(label=train_y.values)

    test_x_transformed_df = pd.DataFrame(test_x_transformed)
    test_x_transformed_df.columns = test_column_name
    test_df_transformed = test_x_transformed_df.assign(label=test_y.values)

    return train_df_transformed, test_df_transformed
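
The hyperparameters are read from a nested HP dictionary; a hypothetical example of the structure this wrapper expects (values are illustrative only, and train_df / test_df are DataFrames whose last column is the label):

HP = {
    'QuantileTransformer': {
        'n_quantiles': 1000,
        'output_distribution': 'normal',
        'ignore_implicit_zeros': False,
        'subsample': 100000,
        'copy': True,
    }
}

train_df_t, test_df_t = QuantileTransformer(train_df, test_df, HP)
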
Example No. 27
def quantile_norm(df):
    """
    Wrapper for sklearn's preprocessing.QuantileTransformer.

    Parameters
    ----------
    df : DataFrame


    Returns
    -------
    DataFrame
        The quantile-transformed data.

    Notes
    -----
    Outer bounds are very low probability regions of the normal distribution so
    min and max are approximately -5 and +5 standard deviations away from mean,
    which limits the utility of this transform.
    """
    assert (isinstance(df, pd.DataFrame))
    qt = preprocessing.QuantileTransformer(output_distribution='normal',
                                           random_state=0)
    return pd.DataFrame(qt.fit_transform(df), index=df.index, columns=df.columns)
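
The ±5 standard-deviation bound mentioned in the Notes is easy to verify: scikit-learn clips the extreme ranks to a small interior quantile before applying the inverse normal CDF, so the output saturates near ±5.2 regardless of the input scale. A quick check:

import numpy as np
import pandas as pd

df = pd.DataFrame({'x': np.random.exponential(size=10000)})
out = quantile_norm(df)
print(out.values.min(), out.values.max())   # roughly -5.2 and +5.2
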
Example No. 28
grid = GridSearchCV(pipe, param_grid=param_grid, cv=3, n_jobs=1, verbose=2)
grid.fit(X_train, y_train)
print(grid.score(X_test, y_test))
print(grid.best_estimator_.score(X_test, y_test))  # same result
print(grid.best_params_)

# https://iaml.it/blog/optimizing-sklearn-pipelines
from sklearn import feature_selection
from sklearn.linear_model import Ridge
n_features_to_test = np.arange(1, 3)
alpha_to_test = 2.0**np.arange(-6, +6)
scalers_to_test = [
    preprocessing.StandardScaler(),
    preprocessing.RobustScaler(),
    preprocessing.QuantileTransformer()
]

params = [
        {'scaler': scalers_to_test,
         'reduce_dim': [decomposition.PCA()], # Parameter of the parameter
         'reduce_dim__n_components': n_features_to_test,
         'regressor__alpha': alpha_to_test}, # Parameter of the parameter

        {'scaler': scalers_to_test,
         'reduce_dim': [feature_selection.SelectKBest(feature_selection.f_regression)],
         'reduce_dim__k': n_features_to_test,\
         'regressor__alpha': alpha_to_test}
        ]

pipe = Pipeline([('scaler', preprocessing.StandardScaler()),
Example No. 29
    def preparation(self, trips, territorial, users, changes=0, binary=0):
        if changes:
            trips = pd.read_excel(trips, index_col=0)
            users = pd.read_excel(users, index_col=4)
            users = users.drop(columns=['Unnamed: 0'])
            terr = pd.read_excel(territorial, index_col=1)
            terr = terr.drop(columns=['Unnamed: 0'])
            terr = terr.fillna(terr.mean())

            df = []
            test = []
            modes = trips['mode'].unique()
            modes = dict({(modes[i], i) for i in range(len(modes))})

            for row, col in trips.iterrows():
                user_id = col['user_id']
                tmp_obj = {}
                d_census_id = col['d_census_id']
                try:
                    territorial_info = terr.loc[int(d_census_id)]
                    user_info = users.loc[user_id]

                    # for i in self.territorial_features:
                    for i in terr.columns:
                        tmp_obj[i] = territorial_info[i]
                    for i in self.trip_features:
                        tmp_obj[i] = col[i]
                    for i in user_info.index:
                        if i == 'Row':
                            continue
                        tmp_obj[i] = user_info[i]
                    format_date = "%Y-%m-%d %H:%M:%S"
                    o_d = datetime.datetime.strptime(
                        tmp_obj['o_datetime'],
                        format_date) + datetime.timedelta(hours=1)
                    d_d = datetime.datetime.strptime(
                        tmp_obj['d_datetime'],
                        format_date) + datetime.timedelta(hours=1)
                    tmp_obj['o_datetime'] = self.date_to_cat(o_d)
                    tmp_obj['d_datetime'] = self.date_to_cat(d_d)
                    tmp_obj['mode'] = modes[tmp_obj['mode']]
                    if tmp_obj['category'] == 'helth':
                        tmp_obj['category'] = 'health'
                    elif tmp_obj['category'] == 'admni_chores':
                        tmp_obj['category'] = 'admin_chores'
                    tmp_obj['category_label'] = tmp_obj['category']
                    tmp_obj['category'] = self.categories[tmp_obj['category']]
                    tmp_obj['activity_time'] = self.activity_to_cat(
                        tmp_obj['activity_time'])
                    tmp_obj['occupation'] = self.occup[tmp_obj['occupation']]

                    if tmp_obj['category_label'] == 'nan' or tmp_obj[
                            'category_label'] == 'NONE':
                        test.append(tmp_obj)
                    else:
                        df.append(tmp_obj)
                except:
                    # pass
                    traceback.print_exc()

            self.df = pd.DataFrame(df)
            self.test = pd.DataFrame(test)

            self.df.to_excel('data/CompleteDataframe_AllTerritorial.xlsx',
                             index=False)
            self.test.to_excel('data/ToLabel_AllTerritorial.xlsx', index=False)
        else:
            self.df = pd.read_excel('data/df.xlsx')
            self.to_label = pd.read_excel('data/ToLabel_AllTerritorial.xlsx')
            print('Dataset Loaded')

        self.df = self.df.sample(frac=1)
        lb = [
            'eating', 'entertainment', 'shopping', 'commuting', 'recreation',
            'health', 'travel', 'home', 'work', 'education', 'religious',
            'police', 'admin_chores'
        ]

        # for row, col in self.df.iterrows():
        # 	if col['category_label'] not in lb:
        # 		self.df = self.df.drop(row)

        if not binary:
            self.labels = {
                'shopping': 3,
                'health': 5,
                'home': 10,
                'work': 9,
                'entertainment': 1,
                'commuting': 0,
                'recreation': 7,
                'education': 2,
                'eating': 4,
                'travel': 6,
                'admin_chores': 8,
                'police': 12,
                'religious': 11
            }
        else:
            self.df = self.df.apply(self.binarization_apply, axis=1)
            self.labels = {
                'systematic (home,work,education)': 0,
                'non-systematic': 1
            }

        target = preprocessing.OneHotEncoder().fit_transform(
            self.df['category'].values.reshape(-1, 1))
        df_train = self.df.drop(columns=['category', 'category_label'])
        self.ct = ColumnTransformer(
            [
                ('oh', preprocessing.OneHotEncoder(), [
                    'activity_time', 'mode', 'd_datetime', 'o_datetime',
                    'occupation', 'gender', 'bin_weekday', 'bin_category'
                ]),
                (
                    'qt',
                    preprocessing.QuantileTransformer(
                        output_distribution='normal'),
                    [
                        'home',
                        'work',
                        'eating',
                        'entertainment',
                        'recreation',
                        'shopping',
                        'travel',
                        'admin_chores',
                        'religious',
                        'health',
                        'police',
                        'education',
                        'age',
                        # 'P_TOT',
                        # 'MALE_TOT','FEM_TOT',
                        'age 25-39',
                        'age 40-64',
                        'age >65',
                        'age 10-24',
                        'P47',
                        'P48',
                        'P49',
                        'P61',
                        'P62',
                        # 'INCOME'
                    ]),
                # ('mm', preprocessing.MinMaxScaler(), ['P61'])
            ],
            # remainder='passthrough'
        )
        df_train = df_train.fillna(df_train.mean())
        self.sc_fit = self.ct.fit(df_train)
        data = self.sc_fit.transform(df_train)
        return data, target
Example No. 30
import matplotlib
get_ipython().magic('matplotlib inline')
model_data_l.cluster.value_counts().plot(kind = 'pie') 


# ### 4.2 A second approach to k-means clustering: transform the variable distributions to normal (used for customer segmentation)

# - 1. Transform the variable distributions to normal

# In[19]:


import numpy as np
from sklearn import preprocessing
quantile_transformer = preprocessing.QuantileTransformer(output_distribution='normal', random_state=0)
fa_scores_trans=quantile_transformer.fit_transform(fa_scores)
fa_scores_trans=pd.DataFrame(fa_scores_trans)
fa_scores_trans=fa_scores_trans.rename(columns={0: "ATM_POS", 1: "TBM", 2: "CSC"})
fa_scores_trans.head()


# In[20]:


var = ["ATM_POS","TBM","CSC"]
skew_var = {}
for i in var:
    skew_var[i]=abs(fa_scores_trans[i].skew())
    skew=pd.Series(skew_var).sort_values(ascending=False)
skew