def data():
    data = pd.read_csv('facies_vectors.csv')
    feature_names = [
        'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS'
    ]
    data = data.fillna(data['PE'].mean())

    train, test = train_test_split(data, test_size=0.3)
    X_train = train[feature_names].values
    y_train = train['Facies'].values
    X_test = test[feature_names].values
    y_test = test['Facies'].values
    well_train = train['Well Name'].values
    well_test = test['Well Name'].values
    depth_train = train['Depth'].values
    depth_test = test['Depth'].values

    robust = preprocessing.RobustScaler(quantile_range=(25.0, 75.0)).fit(X_train)
    X_train_robust = robust.transform(X_train)
    X_test_robust = robust.transform(X_test)

    scaler = preprocessing.RobustScaler(
        quantile_range=(25.0, 75.0)).fit(X_train_robust)
    X_train = scaler.transform(X_train_robust)
    X_test = scaler.transform(X_test_robust)

    Y_train = to_categorical(y_train, 10)
    Y_test = to_categorical(y_test, 10)
    return X_train, Y_train, X_test, Y_test
def init_pp(ppi, raw_data):
    # Initialize list of scaler objects
    if ppi['name'] == 'MinMax':
        pp = [preprocessing.MinMaxScaler(feature_range=(-1.0, 1.0)),  # temp
              preprocessing.MinMaxScaler(feature_range=(-1.0, 1.0))]  # humid.
    elif ppi['name'] == 'MaxAbs':
        pp = [preprocessing.MaxAbsScaler(),  # for temperature
              preprocessing.MaxAbsScaler()]  # and humidity
    elif ppi['name'] == 'StandardScaler':
        pp = [preprocessing.StandardScaler(),  # for temperature
              preprocessing.StandardScaler()]  # and humidity
    elif ppi['name'] == 'RobustScaler':
        pp = [preprocessing.RobustScaler(),  # for temperature
              preprocessing.RobustScaler()]  # and humidity
    elif ppi['name'] == 'SimpleY':
        pp = [10. / 1., 10. / 2.5]  # for temperature
    else:
        raise ValueError('Incorrect scaler name')
    # Initialize scalers with data
    if ppi['method'] == 'individually':
        pp[0].fit(unpack(raw_data, 'T'))
        pp[1].fit(unpack(raw_data, 'q'))
    elif ppi['method'] == 'alltogether':
        pp[0].fit(np.reshape(unpack(raw_data, 'T'), (-1, 1)))
        pp[1].fit(np.reshape(unpack(raw_data, 'q'), (-1, 1)))
    elif ppi['method'] == 'qTindividually':
        if ppi['name'] != 'SimpleY':
            pp = pp[0]
            pp.fit(raw_data)
    else:
        raise ValueError('Incorrect scaler method')
    return pp
def compute_offset(data_df, ref_df):
    # d_scaler = preprocessing.StandardScaler(with_std=False)
    d_scaler = preprocessing.RobustScaler()
    r_scaler = preprocessing.RobustScaler()
    d_scaler.fit(data_df)
    r_scaler.fit(ref_df)
    return r_scaler.center_ - d_scaler.center_
def load_data_prediction(self):
    self.x_pre = np.loadtxt('x_pre.dat')
    self.x_range = np.loadtxt('x_range.dat')
    for i in range(len(self.x_pre)):
        if self.x_pre[i] < self.x_range[i][0] or self.x_pre[i] > self.x_range[i][1]:
            print('The structure is out of range, go to QM calculation')
            # raise IOError
    print(self.x_pre.ndim)
    if self.x_pre.ndim == 0:
        self.n_pre = 1
    if self.x_pre.ndim == 1:
        if self.n_x_dim == 1:
            self.n_pre = self.x_pre.shape[0]
        else:
            self.n_pre = 1
    if self.x_pre.ndim > 1:
        self.n_pre = self.x_pre.shape[0]
    if self.n_pre == 1 and self.n_x_dim == 1:
        self.x_pre = self.x_pre.reshape(1, 1)
    if self.n_pre == 1 and self.n_x_dim != 1:
        self.x_pre = self.x_pre.reshape(1, -1)
    if self.n_pre != 1 and self.n_x_dim == 1:
        self.x_pre = self.x_pre.reshape(self.n_pre, 1)
    self.x_pre_old = self.x_pre

    if self.rescale != "NO":
        if self.rescale == "Normal":
            scale_xdata = preprocessing.StandardScaler()
            scale_xdata.fit(self.x_train)
            self.scale_x_factor = scale_xdata.scale_
            self.mean_x = scale_xdata.mean_
        elif self.rescale == "Robust":
            scale_xdata = preprocessing.RobustScaler()
            scale_xdata.fit(self.x_train)
            self.scale_x_factor = scale_xdata.scale_
            self.mean_x = scale_xdata.center_
        x_train_scale = scale_xdata.transform(self.x_train)
        self.x_train = x_train_scale
        x_pre_scale = scale_xdata.transform(self.x_pre)
        self.x_pre = x_pre_scale
        # Scale y
        scale_ydata = preprocessing.RobustScaler()
        scale_ydata.fit(self.y_train)
        self.scale_y_factor = scale_ydata.scale_
        self.mean_y = scale_ydata.center_
def gendata(doPCA=False):
    data = pd.read_csv('LengthOfStay.csv')
    # Save the length of stay in a different variable
    labels = data['lengthofstay']
    # Drop columns that we don't need, like specific dates or the id of the patient
    data = data.drop(["eid", "vdate", "discharged", "lengthofstay"], axis=1)
    # Add dummy encoding for the object and type variables.
    # For example, turn the gender column into 2 columns, where a male will be a 1
    # in the first column and a 0 in the second column, and a female will be the inverse.
    data = pd.get_dummies(data, columns=['rcount'])
    data = pd.get_dummies(data, columns=['gender'])
    data = pd.get_dummies(data, columns=['facid'])
    if not doPCA:
        hematocrit = data[['hematocrit']].values
        data['hematocrit'] = preprocessing.StandardScaler().fit_transform(hematocrit)
        neutrophils = data[['neutrophils']].values
        data['neutrophils'] = preprocessing.RobustScaler().fit_transform(neutrophils)
        sodium = data[['sodium']].values
        data['sodium'] = preprocessing.StandardScaler().fit_transform(sodium)
        glucose = data[['glucose']].values
        data['glucose'] = preprocessing.StandardScaler().fit_transform(glucose)
        bloodureanitro = data[['bloodureanitro']].values
        data['bloodureanitro'] = preprocessing.RobustScaler().fit_transform(bloodureanitro)
        creatinine = data[['creatinine']].values
        data['creatinine'] = preprocessing.StandardScaler().fit_transform(creatinine)
        bmi = data[['bmi']].values
        data['bmi'] = preprocessing.StandardScaler().fit_transform(bmi)
        pulse = data[['pulse']].values
        data['pulse'] = preprocessing.StandardScaler().fit_transform(pulse)
        respiration = data[['respiration']].values
        data['respiration'] = preprocessing.StandardScaler().fit_transform(respiration)
    # Separate into train and test
    train_X = data.head(n=80000).to_numpy()
    test_X = data.tail(n=20000).to_numpy()
    train_Y = labels.head(n=80000).to_numpy()
    test_Y = labels.tail(n=20000).to_numpy()
    return train_X, test_X, train_Y, test_Y
def dataProcess():
    data = pd.read_csv("//Users/tsukeka/Downloads/LengthOfStay.csv")
    data.drop(columns=["eid", "vdate", "discharged", "facid"], inplace=True)
    data = data.replace({'gender': {'M': 1, 'F': 0}, 'rcount': {'5+': 5}})
    data = data.astype({'rcount': 'int'})
    hematocrit = data[['hematocrit']].values
    data['hematocrit'] = preprocessing.StandardScaler().fit_transform(hematocrit)
    neutrophils = data[['neutrophils']].values
    data['neutrophils'] = preprocessing.RobustScaler().fit_transform(neutrophils)
    sodium = data[['sodium']].values
    data['sodium'] = preprocessing.StandardScaler().fit_transform(sodium)
    glucose = data[['glucose']].values
    data['glucose'] = preprocessing.StandardScaler().fit_transform(glucose)
    bloodureanitro = data[['bloodureanitro']].values
    data['bloodureanitro'] = preprocessing.RobustScaler().fit_transform(bloodureanitro)
    creatinine = data[['creatinine']].values
    data['creatinine'] = preprocessing.StandardScaler().fit_transform(creatinine)
    bmi = data[['bmi']].values
    data['bmi'] = preprocessing.StandardScaler().fit_transform(bmi)
    pulse = data[['pulse']].values
    data['pulse'] = preprocessing.StandardScaler().fit_transform(pulse)
    respiration = data[['respiration']].values
    data['respiration'] = preprocessing.StandardScaler().fit_transform(respiration)
    data = pd.concat(
        [data, pd.get_dummies(data['secondarydiagnosisnonicd9'])], axis=1)
    data = data.drop(columns=['secondarydiagnosisnonicd9'])
    labels = data['lengthofstay']
    data = data.drop(columns=['lengthofstay'])
    pca = PCA()
    data = pca.fit_transform(data)
    train_X = np.array(data[:80000])
    train_Y = labels.head(n=80000).to_numpy()
    test_X = np.array(data[80000:])
    test_Y = labels.tail(n=20000).to_numpy()
    return train_X, test_X, train_Y, test_Y
def __init__(self, x, nb_epoch=770, batch_size=64, learning_rate=0.001,
             H1=58, H2=32, H3=19, DRP=0.1):
    # You can add any input parameters you need
    # Remember to set them with a default value for LabTS tests
    """
    Initialise the model.

    Arguments:
        - x {pd.DataFrame} -- Raw input data of shape (batch_size, input_size),
            used to compute the size of the network.
        - nb_epoch {int} -- number of epochs to train the network.
    """
    # Attributes to store constants to be applied on test data
    self.yScaler = preprocessing.RobustScaler()
    self.xScaler = preprocessing.RobustScaler()
    self.lb = preprocessing.LabelBinarizer()
    self.x = x
    if x is not None:
        X, _ = self._preprocessor(x, training=True)
    # init parameters
    self.loss_values = []
    self.input_size = X.shape[1]
    self.output_size = 1
    self.nb_epoch = nb_epoch
    self.batch_size = batch_size
    self.learning_rate = learning_rate
    self.H1 = H1
    self.H2 = H2
    self.H3 = H3
    self.DRP = DRP
    self.net = Net(self.input_size, self.H1, self.H2, self.H3,
                   self.output_size, self.DRP)
    return
def _process(self, data):
    #features = preprocessing.PolynomialFeatures().fit_transform(features)
    features = preprocessing.RobustScaler().fit_transform(data)
    #features = decomposition.TruncatedSVD().fit_transform(features)
    #cols = list(['f_' + i for i in range(features.shape[1])])
    return pd.DataFrame(features, columns=data.columns)
def median_scaling(train_ds, val_ds, test_ds, y_col_idx):
    train_X, train_y = tf.data.experimental.get_single_element(train_ds)
    val_X, val_y = tf.data.experimental.get_single_element(val_ds)
    test_X, test_y = tf.data.experimental.get_single_element(test_ds)
    train_X, train_y = train_X.numpy(), train_y.numpy()
    val_X, val_y = val_X.numpy(), val_y.numpy()
    test_X, test_y = test_X.numpy(), test_y.numpy()

    from sklearn import preprocessing
    # Note: RobustScaler's quantile_range is given in percentiles (0-100),
    # so (0.02, 0.98) is an extremely narrow band around the median.
    scaler_X = preprocessing.RobustScaler(
        with_centering=False,
        quantile_range=(0.02, 0.98)).fit(train_X.reshape((-1, train_X.shape[-1])))
    #scaler_y = preprocessing.RobustScaler(with_centering=False, quantile_range=(0.02, 0.98)).fit(train_y[:, -1, :])
    #train_y[:, 0, :] = scaler_y.transform(train_y[:, 0, :])
    #val_y[:, 0, :] = scaler_y.transform(val_y[:, 0, :])
    #test_y[:, 0, :] = scaler_y.transform(test_y[:, 0, :])
    for i in range(train_X.shape[1]):
        train_X[:, i, :] = scaler_X.transform(train_X[:, i, :])
        val_X[:, i, :] = scaler_X.transform(val_X[:, i, :])
        test_X[:, i, :] = scaler_X.transform(test_X[:, i, :])
    train_ds = tf.data.Dataset.from_tensors((train_X, train_y))
    val_ds = tf.data.Dataset.from_tensors((val_X, val_y))
    test_ds = tf.data.Dataset.from_tensors((test_X, test_y))
    return train_ds, val_ds, test_ds
def enhancement(template, query, k):
    if k == 1:
        ss = preprocessing.StandardScaler()
        ss.fit(template)
        template = ss.transform(template)
        query = ss.transform(query)
    elif k == 2:
        rs = preprocessing.RobustScaler()
        rs.fit(template)
        template = rs.transform(template)
        query = rs.transform(query)
    elif k == 3:
        mm = preprocessing.MinMaxScaler()
        mm.fit(template)
        template = mm.transform(template)
        query = mm.transform(query)
    elif k == 4:
        pca = PCA(n_components=5)
        pca.fit(template)
        template = pca.transform(template)
        query = pca.transform(query)
    else:
        print("No enhancement applied. Returning original data.")
    return template, query
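# A minimal usage sketch for the enhancement() function above. The feature
# arrays here are made up for illustration; k=2 selects the RobustScaler branch,
# which is fit on the template set only and then applied to both sets.
import numpy as np
from sklearn import preprocessing
from sklearn.decomposition import PCA

template = np.random.rand(20, 8)   # hypothetical gallery features (samples x features)
query = np.random.rand(5, 8)       # hypothetical probe features

template_s, query_s = enhancement(template, query, k=2)
print(template_s.shape, query_s.shape)  # (20, 8) (5, 8)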
def _preprocess_features(self):
    """
    Standardize and normalize features.
    """
    if self.standardize:
        if self.scaler is None:
            # define a new scaler if none is given
            self.scaler = preprocessing.RobustScaler()
            # store all features into a numpy array and fit the standardizer
            X = np.vstack([
                self.features[item].reshape(1, -1) for item in self.items
            ])
            self.scaler.fit(X)
    for item in self.features:
        x = self.features[item].reshape(1, -1)
        if self.standardize:
            x = self.scaler.transform(x)
        if self.normalize is not None:
            x = preprocessing.normalize(x, norm=self.normalize)
        self.features[item] = x.flatten()
def scale_columns(dataframe, columns, scaler_name="RobustScaler"):
    """
    Apply a data normalization method to the specified columns of a Pandas dataframe.

    :param dataframe: Pandas dataframe.
    :param columns: List of column names to scale.
    :param scaler_name: String containing the name of the scaler method
        (default="RobustScaler").
    :return: Pandas dataframe and the scaler object.
    """
    import pandas as pd
    from sklearn import preprocessing

    scaler = None
    if scaler_name == "StandardScaler":
        scaler = preprocessing.StandardScaler()
    if scaler_name == "RobustScaler":
        scaler = preprocessing.RobustScaler()
    if scaler_name == "MinMaxScaler":
        scaler = preprocessing.MinMaxScaler()
    if scaler_name == "Normalizer":
        scaler = preprocessing.Normalizer()
    assert scaler is not None

    data = dataframe.filter(columns, axis=1)
    print(scaler.fit(data))
    scaled_data = scaler.transform(data)
    scaled_df = pd.DataFrame(scaled_data, columns=columns)

    dataframe_scaled = dataframe.copy()
    dataframe_scaled = dataframe_scaled.reset_index(drop=True)
    for column in columns:
        dataframe_scaled[column] = scaled_df[column]
    return dataframe_scaled, scaler
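# A short usage sketch for scale_columns() above; the toy DataFrame and column
# names are hypothetical. RobustScaler keeps the result insensitive to the
# single large income outlier.
import pandas as pd

df_demo = pd.DataFrame({
    'age': [23, 35, 41, 29, 52],
    'income': [40000, 52000, 61000, 1000000, 48000],
})
scaled_df, fitted_scaler = scale_columns(df_demo, ['age', 'income'],
                                         scaler_name="RobustScaler")
print(scaled_df)
# The returned scaler can be reused on new data with the same columns:
# fitted_scaler.transform(new_df[['age', 'income']])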
def preprocess(preprocesstype, var):
    # preprocesstype: selects the preprocessing type for the model:
    #   "MMS" for MinMaxScaler, "RS" for RobustScaler,
    #   "SS" for StandardScaler, "MAS" for MaxAbsScaler
    # var: the np.array the preprocessing is applied to
    from sklearn import preprocessing
    if preprocesstype == "MMS":
        print("preprocessing is done with MinMaxScaler")
        X = preprocessing.MinMaxScaler()
        var = X.fit_transform(var)
        return var
    elif preprocesstype == "RS":
        print("preprocessing is done with RobustScaler")
        X = preprocessing.RobustScaler()
        var = X.fit_transform(var)
        return var
    elif preprocesstype == "SS":
        print("preprocessing is done with StandardScaler")
        X = preprocessing.StandardScaler()
        var = X.fit_transform(var)
        return var
    elif preprocesstype == "MAS":
        print("preprocessing is done with MaxAbsScaler")
        X = preprocessing.MaxAbsScaler()
        var = X.fit_transform(var)
        return var
    else:
        print("Preprocessing type not recognized")
def get_scaler(scale_method='StandardScaler'):
    """
    Get different kinds of scalers from scikit-learn

    :param scale_method: scale method
    :returns: scaler instance
    :raises: none
    """
    scaler = None
    if scale_method == 'StandardScaler':
        scaler = preprocessing.StandardScaler()
    elif scale_method == 'MinMaxScaler':
        scaler = preprocessing.MinMaxScaler()
    elif scale_method == 'MaxAbsScaler':
        scaler = preprocessing.MaxAbsScaler()
    elif scale_method == 'RobustScaler':
        scaler = preprocessing.RobustScaler()
    elif scale_method == 'QuantileTransformer':
        scaler = preprocessing.QuantileTransformer()
    elif scale_method == 'Normalizer':
        scaler = preprocessing.Normalizer()
    elif scale_method == 'PowerTransformer':
        scaler = preprocessing.PowerTransformer()
    else:
        print(scale_method, ' not found')
    return scaler
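# A short usage sketch for get_scaler() above on a made-up array whose second
# column contains an outlier.
import numpy as np
from sklearn import preprocessing

X = np.array([[1.0, 200.0],
              [2.0, 220.0],
              [3.0, 10000.0]])

scaler = get_scaler('RobustScaler')
print(scaler.fit_transform(X))
# An unrecognized name prints a message and returns None, so callers should
# check the result before using it:
print(get_scaler('NoSuchScaler'))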
def keras_mlp3(train2, y, test2, v, z):
    from keras import layers
    from keras import models
    from keras import optimizers
    cname = sys._getframe().f_code.co_name
    num_splits = 9
    scaler = preprocessing.RobustScaler()
    train3 = scaler.fit_transform(train2)
    test3 = scaler.transform(test2)
    input_dims = train3.shape[1]

    def build_model():
        input_ = layers.Input(shape=(input_dims,))
        model = layers.Dense(512, kernel_initializer='Orthogonal')(input_)
        model = layers.Activation('selu')(model)
        model = layers.Dense(256, kernel_initializer='Orthogonal')(model)
        model = layers.Activation('selu')(model)
        model = layers.Dense(32, kernel_initializer='Orthogonal')(model)
        model = layers.Activation('selu')(model)
        model = layers.Dense(1, activation='sigmoid')(model)
        model = models.Model(input_, model)
        model.compile(loss='binary_crossentropy',
                      optimizer=optimizers.SGD(nesterov=True))
        #print(model.summary(line_length=120))
        return model

    keras_common(train3, y, test3, v, z, num_splits, cname, build_model)
def data():
    data = pd.read_csv('facies_vectors.csv')
    data['Facies'] -= 1
    feature_names = [
        'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS',
        'GR_diff_up', 'ILD_log10_diff_up', 'DeltaPHI_diff_up',
        'PHIND_diff_up', 'PE_diff_up', 'NM_M_diff_up', 'RELPOS_diff_up'
    ]
    data = data.fillna(data['PE'].mean())
    new_data = diffind.data_with_diff(data)

    test = new_data[new_data['Well Name'] == 'NEWBY']
    train = new_data[new_data['Well Name'] != 'NEWBY']
    X_train_1 = train[feature_names].values
    y_train = train['Facies'].values
    X_test_1 = test[feature_names].values
    y_test = test['Facies'].values
    well_train = train['Well Name'].values
    well_test = test['Well Name'].values
    depth_train = train['Depth'].values
    depth_test = test['Depth'].values

    X_aug_train = augmentation.augment_features(X_train_1, well_train, depth_train)
    X_aug_test = augmentation.augment_features(X_test_1, well_test, depth_test)

    robust = preprocessing.RobustScaler(quantile_range=(25.0, 75.0)).fit(X_aug_train)
    X_train_robust = robust.transform(X_aug_train)
    X_test_robust = robust.transform(X_aug_test)

    scaler = StandardScaler().fit(X_train_robust)
    X_train_robust_norm = scaler.transform(X_train_robust)
    X_test_robust_norm = scaler.transform(X_test_robust)
    X_train = X_train_robust_norm
    X_test = X_test_robust_norm

    Y_train = to_categorical(y_train, 9)
    Y_test = to_categorical(y_test, 9)
    return (X_train, Y_train, X_test, Y_test)
def test_robustScaler(self):
    data = np.random.normal(10, 3, size=100)
    data = np.array([data]).T
    rob_scaler = preprocessing.RobustScaler()
    self.scaler2dict2scaler_test(rob_scaler, data)
def plot_scalers(X1, X2):
    scalers = [
        pp.StandardScaler(),
        pp.MinMaxScaler(),
        pp.Normalizer(),
        pp.RobustScaler()
    ]
    scaler_names = ['Standard', 'MinMax', 'Normalizer', 'Robust']
    no_of_scalers = len(scalers)
    i = 1
    fig = plt.figure(figsize=(6, 7))
    for scaler, scaler_name in zip(scalers, scaler_names):
        # Note: X1 and X2 are overwritten each iteration, so later panels show
        # the cumulative effect of the previously applied scalers as well.
        X1 = scaler.fit_transform(X1)
        X2 = scaler.fit_transform(X2)
        ax = plt.subplot(2, 2, i)
        ax.scatter(X1[:, 0], X1[:, 1], c='red', marker='.')
        ax.scatter(X2[:, 0], X2[:, 1], c='blue', marker='.')
        ax.set_xlim(-3, 3)
        ax.set_ylim(-3, 3)
        ax.grid(True)
        ax.set_title(scaler_name)
        centeralise_axes(ax)
        i += 1
    plt.show()
def standardizing(df, methods):
    '''
    This function takes in a dataframe and a method for standardizing,
    and it returns the standardized dataframe. The methods are:
    - z: for z-scores
    - mm: for min-max
    - robust: for robust
    - gauss: for gaussian (Yeo-Johnson power transform)
    '''
    if methods == 'z':
        scaler = preprocessing.StandardScaler().fit(df)
        scaled_df = pd.DataFrame(scaler.transform(df))
    elif methods == 'mm':
        scaler = preprocessing.MinMaxScaler().fit(df)
        scaled_df = pd.DataFrame(scaler.transform(df))
    elif methods == 'robust':
        scaler = preprocessing.RobustScaler().fit(df)
        scaled_df = pd.DataFrame(scaler.transform(df))
    else:
        scaler = preprocessing.PowerTransformer(method='yeo-johnson',
                                                standardize=True)
        scaled_df = pd.DataFrame(scaler.fit_transform(df))
    return scaled_df
def robust_scaler(df):
    '''
    This Scaler removes the median and scales the data according to
    the quantile range (defaults to IQR: Interquartile Range).
    '''
    scaler = preprocessing.RobustScaler()
    df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
    return df_scaled
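# The docstring above describes exactly what RobustScaler computes per column:
# subtract the median, then divide by the interquartile range. A small check
# on made-up data (the 100.0 outlier does not move the median or the IQR):
import numpy as np
import pandas as pd
from sklearn import preprocessing

df = pd.DataFrame({'x': [1.0, 2.0, 3.0, 4.0, 100.0]})
scaled = preprocessing.RobustScaler().fit_transform(df)

median = df['x'].median()                              # 3.0
iqr = df['x'].quantile(0.75) - df['x'].quantile(0.25)  # 4.0 - 2.0 = 2.0
manual = (df['x'] - median) / iqr

print(np.allclose(scaled.ravel(), manual.values))  # True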
def get_scaler(scaler_type=constants.SCALER):
    if scaler_type == constants.SCALER_TYPE_STANDARD:
        return preprocessing.StandardScaler()
    if scaler_type == constants.SCALER_TYPE_MIN_MAX:
        return preprocessing.MinMaxScaler()
    if scaler_type == constants.SCALER_TYPE_ROBUST:
        return preprocessing.RobustScaler()
def define_scaling(config):
    """Defines scaling method based on model configurations.

    Args:
        config (ConfigParser-object): object containing the parsed
            configuration-settings of the model.

    Raises:
        ValueError: raised if a non-supported scaling method is specified.

    Returns:
        scaler: the specified scaling method instance.
    """
    if config.get('machine_learning', 'scaler') == 'MinMaxScaler':
        scaler = preprocessing.MinMaxScaler()
    elif config.get('machine_learning', 'scaler') == 'StandardScaler':
        scaler = preprocessing.StandardScaler()
    elif config.get('machine_learning', 'scaler') == 'RobustScaler':
        scaler = preprocessing.RobustScaler()
    elif config.get('machine_learning', 'scaler') == 'QuantileTransformer':
        scaler = preprocessing.QuantileTransformer()
    else:
        raise ValueError(
            'no supported scaling-algorithm selected - choose between '
            'MinMaxScaler, StandardScaler, RobustScaler or QuantileTransformer'
        )

    if config.getboolean('general', 'verbose'):
        print('chosen scaling method is {}'.format(scaler))

    return scaler
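# A minimal sketch of the kind of configuration define_scaling() above expects,
# built in memory with ConfigParser; the section and option names follow the
# code, the specific values are illustrative.
from configparser import ConfigParser
from sklearn import preprocessing  # needed by define_scaling()

config = ConfigParser()
config.read_string("""
[general]
verbose = True

[machine_learning]
scaler = RobustScaler
""")

scaler = define_scaling(config)  # prints the chosen method, returns a RobustScaler instance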
def pre_process_data(X_train, X_validation, X_test, scaler_type,
                     feature_extract=True, log_scale=True):
    if log_scale:
        X_train = np.log10(X_train)
        X_validation = np.log10(X_validation)
        X_test = np.log10(X_test)

    # create scaler
    if scaler_type == 'Standard':
        scaler = preprocessing.StandardScaler().fit(X_train)
    elif scaler_type == 'Robust':
        scaler = preprocessing.RobustScaler().fit(X_train)

    # robust scaling
    X_train_scale = scaler.transform(X_train)
    X_validation_scale = scaler.transform(X_validation)
    X_test_scale = scaler.transform(X_test)

    if feature_extract == True:
        X_train_scale, X_validation_scale, X_test_scale = feature_extraction(
            X_train_scale, X_validation_scale, X_test_scale)

    return (X_train_scale[:100000, :],
            X_validation_scale[:100000, :],
            X_test_scale[:100000, :])
def robust(x_train_dum, scale_list):
    '''
    Robust Scaler
    '''
    rs = preprocessing.RobustScaler()
    x_train_rs = x_train_dum.copy()
    for i in scale_list:
        # RobustScaler expects a 2-D input, so select the column as a
        # single-column frame and flatten the result before assigning it back.
        x_train_rs.iloc[:, i] = rs.fit_transform(x_train_dum.iloc[:, [i]]).ravel()
    return x_train_rs
def cluster(ano, vals, n):
    if (ano == 2014):
        data = d14
    elif (ano == 2015):
        data = d15
    elif (ano == 2016):
        data = d16
    elif (ano == 2017):
        data = d17
    else:
        return "No data available for that year"
    feats = vals
    cols = feats + ['municipio', 'ano']
    df_c = pd.DataFrame(data[cols])
    df_c = df_c.dropna()
    # drop rows with features more than 3 standard deviations from the mean
    df_c = df_c[(np.abs(stats.zscore(df_c[feats])) < 3).all(axis=1)]
    scaler = preprocessing.RobustScaler().fit(df_c[feats])
    train = scaler.transform(df_c[feats])
    kmeans = KMeans(n_clusters=n, random_state=0).fit(train)
    labels = kmeans.labels_
    df_c['labels'] = labels
    return (df_c,
            silhouette_score(train, labels, metric='euclidean'),
            davies_bouldin_score(train, labels))
def normalize(trainX, testX, type=None):
    print(trainX.shape)
    print(testX.shape)
    Scalar = None
    if type == 'standard':
        Scalar = preprocessing.StandardScaler()
    elif type == 'min_max':
        Scalar = preprocessing.MinMaxScaler()
    elif type == 'l1' or type == 'l2':
        Scalar = preprocessing.Normalizer(norm=type)
    elif type == 'l2_v2':
        trainX = trainX / np.expand_dims(np.sqrt(np.sum(trainX ** 2, axis=1)), axis=1)
        testX = testX / np.expand_dims(np.sqrt(np.sum(testX ** 2, axis=1)), axis=1)
    elif type == 'robust':
        Scalar = preprocessing.RobustScaler()
    elif type == 'min-max':
        # note: this branch mixes train and test statistics in the test-set formula
        trainX = (trainX - np.min(trainX)) / (np.max(trainX) - np.min(trainX))
        testX = (testX - np.min(trainX)) / (np.max(testX) - np.min(testX))
    if Scalar is not None:
        trainX = Scalar.fit_transform(trainX)
        # fit on the training data only, then reuse the fitted statistics on the test set
        testX = Scalar.transform(testX)
    return trainX, testX
def scale_periods(dict_dataframes):
    ddi_scaled = dict()
    for key, index_name in enumerate(dict_dataframes):
        ddi_scaled[index_name] = copy.deepcopy(dict_dataframes[index_name])
    for key, index_name in enumerate(ddi_scaled):
        scaler = preprocessing.RobustScaler(with_centering=True)
        for index, value in enumerate(ddi_scaled[index_name]):
            X_train = ddi_scaled[index_name][value][1]
            X_train_scaled = scaler.fit_transform(X_train)
            X_train_scaled_df = pd.DataFrame(X_train_scaled,
                                             columns=list(X_train.columns))
            X_val = ddi_scaled[index_name][value][2]
            X_val_scaled = scaler.transform(X_val)
            X_val_scaled_df = pd.DataFrame(X_val_scaled,
                                           columns=list(X_val.columns))
            X_test = ddi_scaled[index_name][value][3]
            X_test_scaled = scaler.transform(X_test)
            X_test_scaled_df = pd.DataFrame(X_test_scaled,
                                            columns=list(X_test.columns))
            ddi_scaled[index_name][value][1] = X_train_scaled_df
            ddi_scaled[index_name][value][2] = X_val_scaled_df
            ddi_scaled[index_name][value][3] = X_test_scaled_df
    return ddi_scaled
def scale_columns(df, columns, scaler_name="RobustScaler"):
    from sklearn import preprocessing

    scaler = None
    if scaler_name == "StandardScaler":
        scaler = preprocessing.StandardScaler()
    if scaler_name == "RobustScaler":
        scaler = preprocessing.RobustScaler()
    if scaler_name == "MinMaxScaler":
        scaler = preprocessing.MinMaxScaler()
    if scaler_name == "Normalizer":
        scaler = preprocessing.Normalizer()
    assert scaler is not None

    data = df.filter(columns, axis=1)
    print(scaler.fit(data))
    scaled_data = scaler.transform(data)
    scaled_df = pd.DataFrame(scaled_data, columns=columns)

    dataframe_scaled = df.copy()
    dataframe_scaled = dataframe_scaled.reset_index(drop=True)
    for column in columns:
        dataframe_scaled[column] = scaled_df[column]
    return dataframe_scaled, scaler
def model_train(feature, train, label, flag, labelname):
    # --- scaling / normalization ---
    N = preprocessing.RobustScaler()
    scale_feature = N.fit_transform(feature)
    train_feature = scale_feature[:train.shape[0]]
    test_feature = scale_feature[train.shape[0]:]
    print(train_feature.shape, test_feature.shape)

    # ---------------- linear model ----------------
    # 5-fold cross-validation to pick the best regularization strength
    alphas = np.logspace(-4, -1, 30)
    cv_lasso = [
        mse_cv(linear_model.Lasso(alpha), train_feature, label).mean()
        for alpha in alphas
    ]
    # print(alphas)
    # print(cv_lasso)
    index = list(cv_lasso).index(min(cv_lasso))
    print("=best_mse :", min(cv_lasso))
    print("=best_alphas :", alphas[index])

    clf = linear_model.Lasso(alphas[index])
    model = clf.fit(train_feature, label)
    res = model.predict(test_feature)
    print("== model coefficients:", model.coef_)

    test = pd.read_csv("data/test.csv")
    test["pred"] = res
    test[[labelname, "pred"]].to_csv('data/result_{}.csv'.format(flag),
                                     header=None, index=False)
def normalize_attr(x, norm='l1'):
    """Normalize attribute matrix with given type.

    Parameters
    ----------
    x: Numpy array-like matrix
    norm: The specified type for the normalization.
        'l1': l1-norm for axis 1, from `sklearn.preprocessing`.
        'l1_0': l1-norm for axis 0, from `sklearn.preprocessing`.
        'scale': standard scale for axis 0, from `sklearn.preprocessing.scale`
        'robust_scale': robust scale for axis 0, from `sklearn.preprocessing.robust_scale`
        None: return the copy of `x`

    Returns
    -------
    A normalized attribute matrix in Numpy format.
    """
    if norm not in {'l1', 'l1_0', 'scale', 'robust_scale', None}:
        raise ValueError(f'{norm} is not a supported norm.')

    if norm == 'l1':
        x_norm = preprocessing.normalize(x, norm='l1', axis=1)
    elif norm == 'l1_0':
        x_norm = preprocessing.normalize(x, norm='l1', axis=0)
    elif norm == 'scale':
        # something goes wrong with type float32
        x_norm = preprocessing.StandardScaler().fit(x).transform(x)
    elif norm == 'robust_scale':
        x_norm = preprocessing.RobustScaler().fit(x).transform(x)
    else:
        x_norm = x.copy()
    return x_norm
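# A brief usage sketch for normalize_attr() above on a made-up attribute matrix,
# contrasting row-wise l1 normalization with column-wise robust scaling.
import numpy as np
from sklearn import preprocessing

x = np.array([[1.0, 2.0, 7.0],
              [0.0, 3.0, 3.0],
              [2.0, 2.0, 96.0]])   # last column holds a large outlier

x_l1 = normalize_attr(x, norm='l1')             # each row sums to 1
x_rob = normalize_attr(x, norm='robust_scale')  # each column is centered on its median

print(x_l1.sum(axis=1))          # [1. 1. 1.]
print(np.median(x_rob, axis=0))  # [0. 0. 0.]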