Example #1
class QuantileTransformerImpl():
    def __init__(self,
                 n_quantiles=1000,
                 output_distribution='uniform',
                 ignore_implicit_zeros=False,
                 subsample=100000,
                 random_state=None,
                 copy=True):
        self._hyperparams = {
            'n_quantiles': n_quantiles,
            'output_distribution': output_distribution,
            'ignore_implicit_zeros': ignore_implicit_zeros,
            'subsample': subsample,
            'random_state': random_state,
            'copy': copy
        }

    def fit(self, X, y=None):
        self._sklearn_model = SKLModel(**self._hyperparams)
        if (y is not None):
            self._sklearn_model.fit(X, y)
        else:
            self._sklearn_model.fit(X)
        return self

    def transform(self, X):
        return self._sklearn_model.transform(X)
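
A minimal usage sketch for the wrapper above, assuming SKLModel is an alias for sklearn's QuantileTransformer (the import is not shown in this example):

import numpy as np
from sklearn.preprocessing import QuantileTransformer as SKLModel

X = np.random.rand(500, 3)
impl = QuantileTransformerImpl(n_quantiles=100, output_distribution='uniform')
Xt = impl.fit(X).transform(X)   # each feature is mapped onto [0, 1]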
Example #2

def fit_model(model, scaler='quantileNormal'):
	# Use all the data to guarantee that every genre is represented
	X, y, genre_mapping = loadFeatures_NoSplit()

	if scaler=='standard':
		norm = StandardScaler()
	elif scaler=='minMax':
		norm = MinMaxScaler()
	elif scaler=='maxAbs':
		norm = MaxAbsScaler()
	elif scaler=='robust':
		norm = RobustScaler(quantile_range=(25, 75))
	elif scaler=='quantileUniform':
		norm = QuantileTransformer(output_distribution='uniform')
	elif scaler=='quantileNormal':
		norm = QuantileTransformer(output_distribution='normal')
	elif scaler=='L2Norm':
		norm = Normalizer()
	else:
		raise ValueError('Scaler not recognized {}.'.format(scaler))

	if model == 'RF':
		classifier = RandomForestClassifier(n_estimators=100, criterion='entropy', max_features=1, min_samples_leaf=6, min_samples_split=5)
	elif model == 'SVM':
		classifier = SVC(kernel='linear', gamma=0.15043359874882642, C=37.04299991666622)
	elif model == 'LR':
		classifier = LogisticRegression(solver='lbfgs', penalty='l2', C=12.860378815138466, multi_class='multinomial', warm_start=True)
	elif model == 'kNN':
		classifier = KNeighborsClassifier(algorithm='ball_tree', leaf_size=31, n_neighbors=29)
	elif model == 'MLP':
		classifier = MLPClassifier(activation='relu', alpha=0.05440165708835292, batch_size=358, solver='adam')
	elif model == 'SGD':
		classifier = SGDClassifier(penalty='elasticnet', loss='log', alpha=0.02011798244191186, max_iter=50)
	elif model == 'extraTrees':
		classifier = ExtraTreesClassifier(n_estimators=50, criterion='gini', max_features=3, min_samples_leaf=5, min_samples_split=8)
	else:
		raise ValueError('Classifier not recognized {}.'.format(model))

	# Use and save the pipeline so the same steps can be applied to new test data
	pipe = Pipeline([('normalize', norm), ('reduce_dim', LDA()),('classify', classifier)])

	pipe.fit(X, y)

	# Predictions on new data
	# score = pipe.score(X,y)
	# features = pd.read_csv('sampleFeatures.csv', index_col=0, header=[0, 1, 2])
	# X = features.iloc[:, 0:518].values
	# predict = pipe.predict(X)
	# print("Score is %s and Prediction is %s with model %s" % (score, predict, model))

	joblib.dump(pipe, 'final_model.pkl')
	result = "%s Model using %s scaler" % (model, scaler)
	return result
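
A minimal sketch of reusing the saved pipeline for prediction, following the commented-out block above (assumption: 'sampleFeatures.csv' exposes the same 518 feature columns used in training):

import pandas as pd
import joblib

pipe = joblib.load('final_model.pkl')
features = pd.read_csv('sampleFeatures.csv', index_col=0, header=[0, 1, 2])
X_new = features.iloc[:, 0:518].values
print(pipe.predict(X_new))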
Example #3
 def fit(self, X, y=None):
     self._sklearn_model = SKLModel(**self._hyperparams)
     if (y is not None):
         self._sklearn_model.fit(X, y)
     else:
         self._sklearn_model.fit(X)
     return self
Example #4
 def __init__(self, n_quantiles=1000, output_distribution='uniform', ignore_implicit_zeros=False, subsample=100000, random_state=None, copy=True):
     self._hyperparams = {
         'n_quantiles': n_quantiles,
         'output_distribution': output_distribution,
         'ignore_implicit_zeros': ignore_implicit_zeros,
         'subsample': subsample,
         'random_state': random_state,
         'copy': copy}
     self._wrapped_model = Op(**self._hyperparams)
Example #5

def normalize(X, scaler='quantileNormal'):
	if scaler=='standard':
		X = StandardScaler().fit_transform(X)
	elif scaler=='minMax':
		X = MinMaxScaler().fit_transform(X)
	elif scaler=='maxAbs':
		X = MaxAbsScaler().fit_transform(X)
	elif scaler=='robust':
		X = RobustScaler(quantile_range=(25, 75)).fit_transform(X)
	elif scaler=='quantileUniform':
		X = QuantileTransformer(output_distribution='uniform').fit_transform(X)
	elif scaler=='quantileNormal':
		X = QuantileTransformer(output_distribution='normal').fit_transform(X)
	elif scaler=='L2Norm':
		X = Normalizer().fit_transform(X)
	else:
		raise ValueError('Scaler is not defined: %s' % scaler)

	return X
Example #6

    def quantile_scale_map(self,
                           col_need,
                           distribution='uniform',
                           drop_origin_col=False):
        """

        :param col_need:
        :param distribution: 'uniform' (default) or 'normal'
        :param drop_origin_col:
        :return:
        """
        self.quantile_transform = QuantileTransformer(
            output_distribution=distribution)

        array_quantile = self.quantile_transform.fit_transform(
            self.df.loc[:, col_need])
        self._scale_map(array=array_quantile,
                        column_name=col_need,
                        suffix="_q{}Map".format(distribution.capitalize()),
                        drop_origin_columns=drop_origin_col)
Example #7
def main():
    X, y = load_wine()

    distributions = [
        #('Unscaled data', X),
        #('Data after standard scaling', StandardScaler().fit_transform(X)),
        #('Data after min-max scaling', MinMaxScaler().fit_transform(X)),
        #('Data after max-abs scaling', MaxAbsScaler().fit_transform(X)),
        #('Data after robust scaling', RobustScaler(quantile_range=(25, 75)).fit_transform(X)),
        ('Data after quantile transformation (uniform pdf)', QuantileTransformer(output_distribution='uniform').fit_transform(X)),
        #('Data after quantile transformation (gaussian pdf)', QuantileTransformer(output_distribution='normal').fit_transform(X)),
        #('Data after sample-wise L2 normalizing', Normalizer().fit_transform(X)) ## this one sucked
    ]

    for label, scaled_X in distributions:
        print('========================')
        print(label)
        print('========================')
        scaled_X_df = pd.DataFrame(scaled_X, index=X.index, columns=X.columns)
        machine_learn(scaled_X_df, y)
        print('========================')
        print('')

    make_plots(scores)
Example #8
    def getClassWeights(self, weightType, dataSet=None):

        if weightType not in [
                "Freq", "MedianFreq", "1x", "2x", "division", "relativeToMin",
                "quantile"
        ]:
            raise ValueError(
                "Wrong weights calc type given! Valid arguments are [Freq, MedianFreq, 1x, 2x, division, relativeToMin, quantile]"
            )

        # get class weights because of imbalanced dataset (e.g. a lot of road and buildings)
        print("Calculate class ", weightType, " weights...")

        # only calculate the weights from a specific split of the dataset. For performance reasons
        # PART = 1 would be the total dataset
        PART = 10
        classCount = np.zeros(self.config["classes"])
        # count all the classes in every given mask image
        for i in range(int(self.config["trainSize"] / PART)):
            labelImg = self.getImage(i, "trainLabel").flatten()
            labelClassCount = np.bincount(labelImg,
                                          minlength=self.config["classes"])
            classCount += labelClassCount

            if i % int(1000 / PART) == 0:
                print("Label image ", i, "/", self.config["trainSize"] / PART)

        print("Class count: ", classCount.shape, classCount)

        #choose class weights type
        #Frequency
        if weightType == "Freq":
            classWeights = np.median(classCount) / classCount
        #Median Frequency
        elif weightType == "MedianFreq":
            classWeights = np.median(np.median(classCount) / classCount) / (
                np.median(classCount) / classCount)
        # Simple Total/ClassCount
        elif weightType == "1x":
            classWeights = 1 - (classCount / classCount.sum() * 1)
        # Simple Total/ClassCount doubled effect
        elif weightType == "2x":
            classWeights = 1 - (classCount / classCount.sum() * 2)
        # Simple Total/ClassCount divided by Minimum
        elif weightType == "division":
            classWeights = classCount.sum() / classCount
            #divide with minimum
            classWeights[classWeights == 1] = 999999
            classWeights /= classWeights.min()
        # all weights are relative to the smallest class which is assigned the 1.0. Minimal assigned value is 0.1
        elif weightType == "relativeToMin":
            classWeights = classCount.min() / classCount
            print("Class weights: ", classWeights.shape, classWeights)
            classWeights[(classWeights < 0.1)] *= 10
        # using sklearn's QuantileTransformer, the weights are spread over [0, 1); the minimal assigned value is 0.1
        elif weightType == "quantile":
            from sklearn.preprocessing import QuantileTransformer
            _scaler = QuantileTransformer()
            classCount = np.expand_dims(classCount, axis=1)
            classWeights = _scaler.fit_transform(classCount)
            classWeights = np.around(classWeights, decimals=4)
            classWeights = np.squeeze(classWeights)
            classWeights = 1 - classWeights
            classWeights[(classWeights < 0.1)] = 0.1

        else:
            raise ValueError(
                "Wrong weights calc type given! Valid arguments are [Freq, MedianFreq, 1x, 2x, division, relativeToMin, quantile]"
            )

        # eliminate inf values
        classWeights[(classWeights == np.inf)] = 1
        print("Class weights: ", classWeights.shape, classWeights)
        np.save(
            "classWeights" + str(self.config["x"]) + str(self.config["y"]) +
            self.config["name"], classWeights)
Example #9
# Feature 5 has a few but very large outliers.

X = X_full[:, [0, 5]]

distributions = [
    ('Unscaled data', X),
    ('Data after standard scaling',
        StandardScaler().fit_transform(X)),
    ('Data after min-max scaling',
        MinMaxScaler().fit_transform(X)),
    ('Data after max-abs scaling',
        MaxAbsScaler().fit_transform(X)),
    ('Data after robust scaling',
        RobustScaler(quantile_range=(25, 75)).fit_transform(X)),
    ('Data after quantile transformation (uniform pdf)',
        QuantileTransformer(output_distribution='uniform')
        .fit_transform(X)),
    ('Data after quantile transformation (gaussian pdf)',
        QuantileTransformer(output_distribution='normal')
        .fit_transform(X)),
    ('Data after sample-wise L2 normalizing',
        Normalizer().fit_transform(X))
]

# scale the output between 0 and 1 for the colorbar
y = minmax_scale(y_full)


def create_axes(title, figsize=(16, 6)):
    fig = plt.figure(figsize=figsize)
    fig.suptitle(title)
Example #10
for i1 in y1:
    a1 = np.binary_repr(i1, 7)
    ans1.append(a1)
df1['ans1'] = pd.DataFrame(ans1)

for i1 in range(7):
    df['BS1' + str(i1)] = df1['ans1'].str[i1]

# remove original style name now
df.drop(['Style Name'], axis=1, inplace=True)
df = df.dropna(axis=0)

c = df.columns[df.dtypes.eq(object)]
df[c] = df[c].apply(pd.to_numeric, errors='coerce', axis=0)

scaler = QuantileTransformer()
#df3 = scaler.fit_transform(df)

X5 = np.array(df.drop(['Score'], axis=1))
y5 = np.array(df['Score'])

X3 = scaler.fit_transform(pd.DataFrame(X5))
#y3 = scaler.fit_transform(pd.DataFrame(y5))
y3 = 1.2 - np.log(y5)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X5, y5, test_size=0.20)

# Data Munging Compleated.######################

from keras import layers, models
Example #11

class FeatureMap(object):
    def __init__(self, df):
        self.df = copy.deepcopy(df)
        self.onehot = None
        self.label_code = None
        self.col_label_dict = dict()
        self.min_max_scale = None
        self.max_abs_scale = None
        self.standard_scale = None
        self.robust_scale = None
        self.quantile_transform = None

    def log_map(self, col_need, col_replace=True):
        df_need = self.df[col_need]
        if col_replace:
            self.df[col_need] = df_need.apply(lambda x: np.log(x))
        else:
            col_need_extend = [col + "_log" for col in col_need]
            self.df[col_need_extend] = df_need.apply(lambda x: np.log(x))

    def box_cox_map(self, col_need, gamma=1.0, col_replace=True):
        """
        y = ((1+x)**gamma - 1) / gamma  if gamma != 0
            log(1+x)                    if gamma == 0
        ref: http://onlinestatbook.com/2/transformations/box-cox.html
        :param col_need:
        :param gamma:
        :param col_replace:
        :return:
        """
        df_need = self.df[col_need]
        if col_replace:
            self.df[col_need] = df_need.applymap(lambda x: boxcox1p(x, gamma))
        else:
            col_need_extend = [col + "_boxCox" for col in col_need]
            self.df[col_need_extend] = df_need.applymap(
                lambda x: boxcox1p(x, gamma))

    def onehot_encode(self, col_need, start_zero=True):
        """
        onehot encode DataFrame of which the columns you need
        note: the origin category should be integer in range(classes) or range(classes+1)
        :param col_need:
        :param start_zero: category is in range(classes)
        :return: new DataFrame without col_need, after onehot encoding,
                  start method is in accordance with start_zero
        """
        self.onehot = OneHotEncoder(sparse=False)
        array_onehot = self.onehot.fit_transform(self.df.loc[:, col_need])

        col_onehot = []

        for col_index in range(len(col_need)):
            if start_zero:
                for hot_index in range(self.onehot.n_values_[col_index]):
                    col_onehot.append(col_need[col_index] + str(hot_index))
            else:
                for hot_index in range(1, self.onehot.n_values_[col_index]):
                    col_onehot.append(col_need[col_index] + str(hot_index))

        self.df.drop(col_need, axis=1, inplace=True)

        df_onehot = pd.DataFrame(array_onehot,
                                 columns=col_onehot,
                                 index=self.df.index)
        self.df = pd.concat([self.df, df_onehot], axis=1)

    def label_encode(self, col_need):
        """
        onehot encode DataFrame of which the columns you need
        :param col_need: length should be 1
        :return: new DataFrame without col_need, after label encoding, start from 0
        """
        assert isinstance(col_need, list) and len(col_need) == 1
        self.label_code = LabelEncoder()
        array_label_code = self.label_code.fit_transform(self.df.loc[:,
                                                                     col_need])

        label_list = list(self.label_code.classes_)
        for i, x in enumerate(label_list):
            self.col_label_dict[col_need[0] + "_" +
                                str(i)] = col_need[0] + "_" + x

        self.df.drop(col_need, axis=1, inplace=True)

        df_label_code = pd.DataFrame(array_label_code,
                                     columns=col_need,
                                     index=self.df.index)
        self.df = pd.concat([self.df, df_label_code], axis=1)

    def standard_scale_map(self, col_need, drop_origin_col=False):
        self.standard_scale = StandardScaler()
        array_standard = self.standard_scale.fit_transform(
            self.df.loc[:, col_need])
        self._scale_map(array=array_standard,
                        column_name=col_need,
                        suffix="_stdScale",
                        drop_origin_columns=drop_origin_col)

    def min_max_scale_map(self, col_need, drop_origin_col=False):
        self.min_max_scale = MinMaxScaler()
        array_min_max = self.min_max_scale.fit_transform(self.df.loc[:,
                                                                     col_need])
        self._scale_map(array=array_min_max,
                        column_name=col_need,
                        suffix="_minMaxScale",
                        drop_origin_columns=drop_origin_col)

    def max_abs_scale_map(self, col_need, drop_origin_col=False):
        self.max_abs_scale = MaxAbsScaler()
        array_max_abs = self.max_abs_scale.fit_transform(self.df.loc[:,
                                                                     col_need])
        self._scale_map(array=array_max_abs,
                        column_name=col_need,
                        suffix="_maxAbsScale",
                        drop_origin_columns=drop_origin_col)

    def robust_scale_map(self,
                         col_need,
                         quantile_range=(25, 75),
                         drop_origin_col=False):
        """
        This Scaler removes the median and scales the data according to
        the quantile range (defaults to IQR: Interquartile Range).
        The IQR is the range between the 1st quartile (25th quantile)
        and the 3rd quartile (75th quantile).
        :param col_need:
        :param quantile_range:
        :param drop_origin_col:
        :return:
        """
        self.robust_scale = RobustScaler(quantile_range=quantile_range)
        array_robust = self.robust_scale.fit_transform(self.df.loc[:,
                                                                   col_need])
        self._scale_map(array=array_robust,
                        column_name=col_need,
                        suffix="_robust_scale",
                        drop_origin_columns=drop_origin_col)

    def quantile_scale_map(self,
                           col_need,
                           distribution='uniform',
                           drop_origin_col=False):
        """

        :param col_need:
        :param distribution: 'uniform' (default) or 'normal'
        :param drop_origin_col:
        :return:
        """
        self.quantile_transform = QuantileTransformer(
            output_distribution=distribution)

        array_quantile = self.quantile_transform.fit_transform(
            self.df.loc[:, col_need])
        self._scale_map(array=array_quantile,
                        column_name=col_need,
                        suffix="_q{}Map".format(distribution.capitalize()),
                        drop_origin_columns=drop_origin_col)

    def _scale_map(self,
                   array,
                   column_name,
                   suffix,
                   drop_origin_columns=False):
        if drop_origin_columns:
            self.df.drop(column_name, axis=1, inplace=True)

        col = [col + suffix for col in column_name]
        df_scale = pd.DataFrame(array, columns=col, index=self.df.index)
        self.df = pd.concat([self.df, df_scale], axis=1)

    def quantile_floor_map(self, col_need, floor_num=5, drop_origin_col=False):
        """
        after quantile_scale_map when distribution='uniform', value is scaled in [0, 1]
        for tree models, onehot encoding is need
        :param col_need:
        :param floor_num: uniform floor map
        :param drop_origin_col
        :return:
        """
        bool0 = (self.df.loc[:, col_need] >= 0) & (self.df.loc[:, col_need] <= 1)
        assert bool0.all().all()
        col_suffix = np.array([x.endswith("_qUniformMap") for x in col_need])
        assert np.prod(col_suffix)

        array_quantile_floor = (self.df.loc[:, col_need].values *
                                floor_num).astype(int)
        self._scale_map(array=array_quantile_floor,
                        column_name=col_need,
                        suffix="_qFloorMap",
                        drop_origin_columns=drop_origin_col)
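
A small usage sketch of the FeatureMap class above on a made-up DataFrame; it assumes the class's own imports (copy, pandas, and the sklearn preprocessing classes), which the listing omits, are in place:

import numpy as np
import pandas as pd

raw = pd.DataFrame({'price': np.random.lognormal(size=2000),
                    'rooms': np.random.randint(1, 6, size=2000)})
fm = FeatureMap(raw)
fm.quantile_scale_map(col_need=['price'], distribution='uniform')
print(fm.df.columns.tolist())   # ['price', 'rooms', 'price_qUniformMap']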
Example #12
    # on the distribution of values being transformed
    # Documentation at https://seaborn.pydata.org/generated/seaborn.distplot.html

    #this is the list of transformations to perform on the data
    distributions = [
        ('Unscaled data', X, 'Unscaled'),
        ('Data after standard scaling',
         StandardScaler().fit_transform(X), 'standard'),
        ('Data after min-max scaling',
         MinMaxScaler().fit_transform(X), 'min_max'),
        ('Data after max-abs scaling',
         MaxAbsScaler().fit_transform(X), 'max_abs'),
        ('Data after robust scaling',
         RobustScaler(quantile_range=(25, 75)).fit_transform(X), 'robust'),
        ('Data after quantile transformation (uniform pdf)',
         QuantileTransformer(output_distribution='uniform')
         .fit_transform(X), 'quantile'),
        ('Data after quantile transformation (gaussian pdf)',
         QuantileTransformer(output_distribution='normal')
         .fit_transform(X), 'gaussian'),
        ('Data after sample-wise L2 normalizing',
         Normalizer().fit_transform(X), 'normalizing'),
        ('Data after Log + 1', np.log(X + 1), 'Log + 1'),
    ]

    #define function to produce the graphic
    def create_plts(x):
        temp = distributions[x][2]
        temp, ax = plt.subplots()
        sns.distplot(distributions[x][1]).set_title(distributions[x][0])
        temp.savefig(cols + distributions[x][0] + '.pdf', 
                     bbox_inches = 'tight', dpi=None, facecolor='w', edgecolor='b', 
Example #13
 def __init__(self, nQuantile=None):
     self.nQuantile = Separator_num.nQUANTILE if nQuantile is None else nQuantile
     Pipeline.__init__(self, [ ('geometry' , Separator_num.Selector())
                             , ('quantiled', QuantileTransformer(n_quantiles=self.nQuantile, copy=False))  #use in-place scaling
                             ])
Example #14

# LinearScaler is not defined in this snippet; a minimal passthrough
# (identity fit_transform) is assumed here so the 'Unscaled data' entry below works.
class LinearScaler:
    def fit_transform(self, x):
        return x


Scalers = {
    'Unscaled data':
    LinearScaler(),
    'Standard scaling':
    StandardScaler(),
    'Min-max scaling':
    MinMaxScaler(),
    'Max-abs scaling':
    MaxAbsScaler(),
    'Robust scaling':
    RobustScaler(quantile_range=(25, 75)),
    'Quantile transformation (uniform pdf)':
    QuantileTransformer(output_distribution='uniform'),
    'Quantile transformation (gaussian pdf)':
    QuantileTransformer(output_distribution='normal'),
    'Sample-wise L2 normalizing':
    Normalizer(norm='l2'),
    'Sample-wise L1 normalizing':
    Normalizer(norm='l1'),
}
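
A brief sketch iterating over the Scalers mapping above on a toy matrix, assuming each entry (including the LinearScaler baseline) exposes fit_transform:

import numpy as np

X_demo = np.random.lognormal(size=(1000, 2))
for name, scaler in Scalers.items():
    X_scaled = scaler.fit_transform(X_demo)
    print(name, X_scaled.min(axis=0), X_scaled.max(axis=0))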


def create_axes(title, figsize=(16, 6)):
    fig = plt.figure(figsize=figsize)
    fig.suptitle(title)

    # define the axis for the first plot
    left, width = 0.1, 0.22
Example #15
# Author: Franz Weidmann
# Info: Creates a one-class SVM classifier for each host to capture the host's
# normal state. All scalers and models are trained and saved to a single pickle file.

import numpy as np
from sklearn import svm
import joblib
from sklearn.preprocessing import QuantileTransformer

trainData = np.load("../../data/data.npy")

# transform data for scaling and save the state of the transformer for each host
scalers = []
for h in range(trainData.shape[0]):
    _scaler = QuantileTransformer()
    trainData[h] = _scaler.fit_transform(trainData[h])
    scalers.append(_scaler)

# train an SVM one-class classifier for every host
models = []
for modelIndex in range(trainData.shape[0]):
    print("Creating model ", modelIndex)
    model = svm.OneClassSVM(kernel="rbf", verbose=True)
    model.fit(trainData[modelIndex])
    models.append(model)
    print("Trained model ", modelIndex)

joblib.dump([scalers, models], "models.pkl")
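
A hedged follow-up sketch of how the saved [scalers, models] pair could be used for scoring; 'new_data.npy' is a hypothetical file with the same per-host layout as data.npy:

import numpy as np
import joblib

scalers, models = joblib.load("models.pkl")
newData = np.load("../../data/new_data.npy")
for h in range(newData.shape[0]):
    scaled = scalers[h].transform(newData[h])
    pred = models[h].predict(scaled)            # +1 = inlier (normal), -1 = outlier
    print("Host", h, "anomalies:", int((pred == -1).sum()))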