def test_min_max_scaler_1d(): """Test scaling of dataset along single axis""" rng = np.random.RandomState(0) X = rng.randn(5) X_orig_copy = X.copy() scaler = MinMaxScaler() X_scaled = scaler.fit(X).transform(X) assert_array_almost_equal(X_scaled.min(axis=0), 0.0) assert_array_almost_equal(X_scaled.max(axis=0), 1.0) # check inverse transform X_scaled_back = scaler.inverse_transform(X_scaled) assert_array_almost_equal(X_scaled_back, X_orig_copy) # Test with 1D list X = [0., 1., 2, 0.4, 1.] scaler = MinMaxScaler() X_scaled = scaler.fit(X).transform(X) assert_array_almost_equal(X_scaled.min(axis=0), 0.0) assert_array_almost_equal(X_scaled.max(axis=0), 1.0) # Constant feature. X = np.zeros(5) scaler = MinMaxScaler() X_scaled = scaler.fit(X).transform(X) assert_greater_equal(X_scaled.min(), 0.) assert_less_equal(X_scaled.max(), 1.)
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler

# getSubset, binaryToMultiLabel, evaluateFile, loading, and evaluation
# are project-local helpers.


def learn(examples, Classifier, classifierArgs, develFolds=10, verbose=3,
          n_jobs=1, predKey="ml_comb_pred", limitTerms=None):
    print("Parameter grid search")
    develExamples = getSubset(examples, ["devel"])
    clf = GridSearchCV(Classifier(), classifierArgs, cv=develFolds,
                       verbose=verbose, n_jobs=n_jobs, scoring="f1_micro")
    clf.fit(develExamples["features"], develExamples["classes"])
    print("Best params", (clf.best_params_, clf.best_score_))

    print("Predicting all examples")
    minMax = MinMaxScaler((0.03, 1.0))
    allPredictions = clf.predict(examples["features"])
    # Prefer class probabilities; fall back to decision function scores.
    if hasattr(clf, "predict_proba"):
        allProbabilities = clf.predict_proba(examples["features"])
    else:
        allProbabilities = clf.decision_function(examples["features"])
    # Rescale the confidence scores into the (0.03, 1.0) range.
    minMax.fit(allProbabilities)
    allProbabilities = minMax.transform(allProbabilities)

    print("Predicting the test set")
    testExamples = getSubset(examples, ["test"])
    testPredictions = clf.predict(testExamples["features"])
    if hasattr(clf, "predict_proba"):
        testProbabilities = clf.predict_proba(testExamples["features"])
    else:
        testProbabilities = clf.decision_function(testExamples["features"])
    testProbabilities = minMax.transform(testProbabilities)
    binaryToMultiLabel(testExamples, testPredictions, testProbabilities,
                       predKey)

    print("Evaluating test set ensemble predictions")
    testProteins = {x["id"]: x for x in testExamples["proteins"]}
    multiLabelTestExamples = evaluateFile.makeExamples(
        testProteins, limitTerms=limitTerms, limitToSets=["test"],
        predKey=predKey)
    loading.vectorizeExamples(multiLabelTestExamples, None, sparseLabels=True)
    results = evaluation.evaluate(
        multiLabelTestExamples["labels"],
        multiLabelTestExamples["predictions"],
        multiLabelTestExamples, terms=None, averageOnly=True, noAUC=True)
    print("Average for test set:",
          evaluation.metricsToString(results["average"]))
    binaryToMultiLabel(examples, allPredictions, allProbabilities, predKey)
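# A self-contained sketch of the score-rescaling step above: decision_function
# scores are unbounded, so a MinMaxScaler fitted on them maps them into
# (0.03, 1.0). LinearSVC and the toy data are stand-ins chosen only for
# illustration; the real code fits whatever Classifier was passed in.
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC

X = np.random.RandomState(0).randn(40, 3)
y = (X[:, 0] > 0).astype(int)
scores = LinearSVC().fit(X, y).decision_function(X).reshape(-1, 1)
rescaled = MinMaxScaler((0.03, 1.0)).fit_transform(scores)
print(rescaled.min(), rescaled.max())  # 0.03 1.0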
import pandas as pd
from sklearn.base import TransformerMixin
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import assert_all_finite


class Scaler(TransformerMixin):
    """Wraps MinMaxScaler so it accepts and returns pandas DataFrames."""

    def __init__(self):
        self._scaler = MinMaxScaler(feature_range=(-1, 1))

    def transform(self, df, *_):
        assert_all_finite(df)
        scaled = self._scaler.transform(df)
        # Rebuild a DataFrame so the column labels survive the transform.
        df = pd.DataFrame(scaled, columns=df.columns)
        assert_all_finite(df)
        return df

    def fit(self, df, *_):
        self._scaler.fit(df)
        return self
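# Example usage of the wrapper above; the column names and values are made
# up for the demonstration.
df = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [10.0, 20.0, 40.0]})
print(Scaler().fit(df).transform(df))  # each column now spans -1.0 ... 1.0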
from sklearn.preprocessing import MinMaxScaler as SKLModel


class MinMaxScalerImpl():
    """Thin wrapper that defers to scikit-learn's MinMaxScaler."""

    def __init__(self, feature_range=(0, 1), copy=True):
        self._hyperparams = {'feature_range': feature_range, 'copy': copy}

    def fit(self, X, y=None):
        self._sklearn_model = SKLModel(**self._hyperparams)
        if y is not None:
            self._sklearn_model.fit(X, y)
        else:
            self._sklearn_model.fit(X)
        return self

    def transform(self, X):
        return self._sklearn_model.transform(X)
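# A quick illustrative check that the wrapper defined above matches
# scikit-learn used directly.
import numpy as np

X = np.array([[0.0, 10.0], [5.0, 20.0], [10.0, 30.0]])
wrapped = MinMaxScalerImpl(feature_range=(0, 2)).fit(X).transform(X)
direct = SKLModel(feature_range=(0, 2)).fit_transform(X)
print(np.allclose(wrapped, direct))  # True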
from sklearn.preprocessing import MinMaxScaler

# CreateModel, Timer, and LearningTask come from the surrounding
# benchmark harness.


class CreateMinMaxScaler(CreateModel):
    def fit(self, data, args):
        self.model = MinMaxScaler()
        with Timer() as t:
            # MinMaxScaler ignores y; it is passed only to satisfy the API.
            self.model.fit(data.X_train, data.y_train)
        return t.interval

    def test(self, data):
        assert self.model is not None
        return self.model.transform(data.X_test)

    def predict(self, data):
        with Timer() as t:
            self.predictions = self.test(data)
        data.learning_task = LearningTask.REGRESSION
        return t.interval
from math import ceil, floor

import numpy as np
from sklearn.preprocessing import MinMaxScaler


def create_scaler():
    global data_source
    global data
    my_data = data[data_source]
    # Pool the performance values of every method into one list.
    all_performances = []
    for method, performances in my_data.items():
        all_performances = all_performances + performances
    min_v = min(all_performances)
    max_v = max(all_performances)
    lower = floor(min_v)
    upper = ceil(max_v)
    # For values below 1, rounding to whole numbers would distort the
    # range, so keep the exact bounds instead.
    if max_v < 1:
        upper = max_v
        lower = min_v
    scaler = MinMaxScaler(feature_range=(lower, upper))
    scaler.fit(np.array(all_performances).reshape(-1, 1))
    return scaler
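# Illustrative setup for create_scaler(): `data` maps a data source to
# per-method performance lists. The names and numbers are hypothetical.
data = {"benchmark": {"method_a": [2.3, 4.1], "method_b": [3.7, 5.9]}}
data_source = "benchmark"
print(create_scaler().feature_range)  # (2, 6), i.e. floor(2.3) .. ceil(5.9)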
import numpy as np
from sklearn.preprocessing import MinMaxScaler


def correct_values(values, min_value, max_value):
    '''
    Ensures that values are in the given range.

    @param values: 1d numpy array
    '''
    lowest_val = np.min(values)
    largest_val = np.max(values)
    lowest_val_valid = lowest_val >= min_value and lowest_val < max_value
    largest_val_valid = largest_val <= max_value and largest_val > min_value

    # Do nothing if all values are already valid.
    if lowest_val_valid and largest_val_valid:
        pass
    else:
        # +/-1 was once used here to prevent AssertionErrors caused by
        # rounding errors, but it introduced a new exception: "ValueError:
        # Minimum of desired feature range must be smaller than maximum.
        # Got (84.80001171045868, 84)." It is therefore used without +/-1,
        # and the assertions below were adapted instead.
        min_value_for_scaler = min_value
        max_value_for_scaler = max_value

        # Re-use the max/min values in the data if they are valid; otherwise
        # all functions would end up in the same range.
        if lowest_val_valid:
            min_value_for_scaler = lowest_val
        if largest_val_valid:
            max_value_for_scaler = largest_val

        scaler = MinMaxScaler(feature_range=(min_value_for_scaler,
                                             max_value_for_scaler))
        # Reshape to a single column; 1D input raises a DeprecationWarning
        # (an error in newer scikit-learn versions).
        reshaped_values = values.reshape(-1, 1)
        scaler = scaler.fit(reshaped_values)
        values = scaler.transform(reshaped_values)
        values = np.reshape(values, len(values))  # restore the original shape

    min_in_scaled = np.min(values)
    max_in_scaled = np.max(values)
    # Test whether min_value <= min_in_scaled, with a small tolerance for
    # floating-point error ...
    assert min_value - min_in_scaled <= 0.0000001, "current min: " + \
        str(min_in_scaled) + " but allowed min is: " + str(min_value)
    # ... and whether max_in_scaled <= max_value.
    assert max_in_scaled - max_value <= 0.000001, "current max: " + \
        str(max_in_scaled) + " but allowed max is: " + str(max_value)
    return values
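# Example (illustrative): squeeze out-of-range values back into [0, 10].
out_of_range = np.array([-2.0, 3.0, 12.5])
corrected = correct_values(out_of_range, 0, 10)
print(corrected)  # approximately [0. 3.448 10.]; min and max now sit on the bounds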
import datetime as dt

from sklearn.preprocessing import MinMaxScaler

# TimeSeriesTensor is a project-local helper that slices a dataframe into
# (lagged input, horizon target) tensors.


def split_train_validation_test(multi_time_series_df, valid_start_time,
                                test_start_time, features, time_step_lag=1,
                                horizon=1, target='target',
                                time_format='%Y-%m-%d %H:%M:%S', freq='H'):
    if not isinstance(features, list) or len(features) < 1:
        raise Exception(
            "Bad input for features. It must be a list of dataframe columns "
            "to use")

    train = multi_time_series_df.copy()[
        multi_time_series_df.index < valid_start_time]
    train = train[features]

    X_scaler = MinMaxScaler()
    # Fit a separate scaler on the target column so predictions can be
    # inverse-transformed back to the original units later.
    if 'load' in features:
        y_scaler = MinMaxScaler()
        y_scaler.fit(train[['load']])
    else:
        y_scaler = MinMaxScaler()
        tg = train[target]
        y_scaler.fit(tg.values.reshape(-1, 1))
    train[features] = X_scaler.fit_transform(train)

    tensor_structure = {'X': (range(-time_step_lag + 1, 1), features)}
    train_inputs = TimeSeriesTensor(train, target=target, H=horizon,
                                    freq=freq,
                                    tensor_structure=tensor_structure)
    print(train_inputs.dataframe.head())

    # The validation set needs `time_step_lag - 1` hours of look-back so the
    # first validation sample has a full input window.
    look_back_dt = dt.datetime.strptime(valid_start_time, time_format) - \
        dt.timedelta(hours=time_step_lag - 1)
    valid = multi_time_series_df.copy()[
        (multi_time_series_df.index >= look_back_dt) &
        (multi_time_series_df.index < test_start_time)]
    valid = valid[features]
    valid[features] = X_scaler.transform(valid)

    tensor_structure = {'X': (range(-time_step_lag + 1, 1), features)}
    valid_inputs = TimeSeriesTensor(valid, target=target, H=horizon,
                                    freq=freq,
                                    tensor_structure=tensor_structure)
    print(valid_inputs.dataframe.head())

    # Test set.
    # look_back_dt = dt.datetime.strptime(test_start_time, time_format) - \
    #     dt.timedelta(hours=time_step_lag - 1)
    test = multi_time_series_df.copy()[test_start_time:]
    test = test[features]
    test[features] = X_scaler.transform(test)
    test_inputs = TimeSeriesTensor(test, target=target, H=horizon, freq=freq,
                                   tensor_structure=tensor_structure)

    print("time lag:", time_step_lag, "original_feature:", len(features))
    return train_inputs, valid_inputs, test_inputs, y_scaler
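# A minimal, self-contained sketch of why y_scaler is returned: predictions
# made in scaled space can be mapped back to the original units with
# inverse_transform. The numbers here are made up.
import numpy as np
from sklearn.preprocessing import MinMaxScaler

y_train = np.array([100.0, 150.0, 200.0]).reshape(-1, 1)
y_scaler = MinMaxScaler().fit(y_train)
scaled_prediction = np.array([[0.5]])  # hypothetical model output
print(y_scaler.inverse_transform(scaled_prediction))  # [[150.]]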
import numpy as np
import tensorflow as tf  # TF1-style API (use tf.compat.v1 on TensorFlow 2)
from sklearn.preprocessing import MinMaxScaler

# Non-normalized inputs
xy = np.array([[828.659973, 833.450012, 908100, 828.349976, 831.659973],
               [823.02002, 828.070007, 1828100, 821.655029, 828.070007],
               [819.929993, 824.400024, 1438100, 818.97998, 824.159973],
               [816, 820.958984, 1008100, 815.48999, 819.23999],
               [819.359985, 823, 1188100, 818.469971, 818.97998],
               [819, 823, 1198100, 816, 820.450012],
               [811.700012, 815.25, 1098100, 809.780029, 813.669983],
               [809.51001, 816.659973, 1398100, 804.539978, 809.559998]])

# Normalization - MinMaxScaler maps every column into 0 ~ 1
scaler = MinMaxScaler()
scaler.fit(xy)
xy = scaler.transform(xy)

x_data = xy[:, 0:-1]
y_data = xy[:, [-1]]

X = tf.placeholder(tf.float32, shape=[None, 4])
Y = tf.placeholder(tf.float32, shape=[None, 1])

W = tf.Variable(tf.random_normal([4, 1]), name='weight')
b = tf.Variable(tf.random_normal([1]), name='bias')

# Linear hypothesis and mean squared error cost.
hypothesis = tf.matmul(X, W) + b
cost = tf.reduce_mean(tf.square(hypothesis - Y))

# Minimize the cost with gradient descent.
optimizer = tf.train.GradientDescentOptimizer(learning_rate=1e-5)
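# A minimal sketch of the TF1-style training loop this snippet would
# typically continue with; the step count and print interval are arbitrary.
train = optimizer.minimize(cost)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step in range(2001):
        cost_val, _ = sess.run([cost, train],
                               feed_dict={X: x_data, Y: y_data})
        if step % 500 == 0:
            print(step, "cost:", cost_val)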