def log_transform(X_train, X_valid, X_test, columns):
    # Apply log1p in place to the selected columns of each split.
    t = FunctionTransformer(np.log1p)
    X_train[:, columns] = t.transform(X_train[:, columns])
    X_valid[:, columns] = t.transform(X_valid[:, columns])
    X_test[:, columns] = t.transform(X_test[:, columns])
    return X_train, X_valid, X_test
def preprocess(data):
    num = ['variable2', 'variable3', 'variable8', 'variable11',
           'variable14', 'variable15', 'variable17', 'variable19']
    label = data.classLabel
    train = data.drop('classLabel', axis=1)
    train[num] = preprocessing.scale(train[num])
    transformer = FunctionTransformer(np.log1p, validate=True)
    # transform() is not in-place, so assign the result back.
    train[num] = transformer.transform(train[num])
    train[num] = preprocessing.normalize(train[num], norm='l2')
    return train, label
def prepare_data(input_filename, label_column, train_size, test_size,
                 add_log_vars):
    df = pd.read_csv(input_filename, delimiter=',', index_col=False, header=0)
    data = df.values
    column_names = np.char.array(df.columns.values)
    print('Number of columns in data {}'.format(len(column_names)))

    # Extract features/labels and their names from the raw data. Don't
    # include the column next to the label, since it's gender.
    features = data[:, 0:label_column - 1]
    labels = data[:, label_column].astype(int)
    feature_names = column_names[0:label_column - 1]
    label_name = column_names[label_column]
    class_values = list(set(labels))
    class_values.sort()

    train_features, test_features, train_labels, test_labels = (
        model_selection.train_test_split(features, labels,
                                         test_size=test_size))
    # Create the requested train size.
    train_features, train_labels = undersample(train_features, train_labels,
                                               train_size)

    # Impute the data and replace missing values.
    imputer = Imputer(missing_values="NaN", strategy='mean', axis=0,
                      copy=False)
    imputer.fit(train_features)
    train_features = imputer.transform(train_features)
    test_features = imputer.transform(test_features)

    # Only after imputing NaNs, get the list of columns with negative
    # values, so we won't apply the log transformation to them.
    if add_log_vars:
        column_mins = np.amin(
            np.concatenate((train_features, test_features), axis=0), axis=0)
        pos_feature_names = feature_names[column_mins >= 0]
        neg_feature_names = feature_names[column_mins < 0]
        pos_train_features = train_features[:, column_mins >= 0]
        pos_test_features = test_features[:, column_mins >= 0]
        # Make sure negative features are only skewness related.
        assert all(['skewness' in feature for feature in neg_feature_names])
        # Add a log(x + 1) version of the non-negative features.
        transformer = FunctionTransformer(np.log1p)
        log_pos_train_features = transformer.transform(pos_train_features)
        log_pos_test_features = transformer.transform(pos_test_features)
        log_pos_feature_names = pos_feature_names + "_log"
        train_features = np.concatenate(
            (train_features, log_pos_train_features), axis=1)
        test_features = np.concatenate(
            (test_features, log_pos_test_features), axis=1)
        feature_names = np.concatenate((feature_names, log_pos_feature_names))
        print('Number of columns in data after adding log vars {}'.format(
            len(feature_names)))

    return (train_features, train_labels, test_features, test_labels,
            class_values, feature_names, label_name)
class FunctionTransformerPreprocessor(object):
    """Processor that drops the first column, and returns log of features."""

    def _drop_first_feature(self, data):
        return data[:, 1:]

    def __init__(self):
        self._log_transformer = FunctionTransformer(np.log1p)
        # Use a distinct attribute name so the transformer does not shadow
        # the _drop_first_feature method.
        self._drop_transformer = FunctionTransformer(self._drop_first_feature)

    def preprocess(self, instances):
        return self._log_transformer.transform(
            self._drop_transformer.transform(instances))
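# A quick sanity check of FunctionTransformerPreprocessor on a toy matrix
# (this usage sketch is illustrative, not from the original source): the
# first column is dropped and log1p is applied to the remaining ones.
import numpy as np

pre = FunctionTransformerPreprocessor()
out = pre.preprocess(np.array([[1.0, 0.0, np.e - 1.0]]))
# out == [[log1p(0.0), log1p(e - 1.0)]] == [[0.0, 1.0]]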
def process_text(self, text: tf.Tensor) -> tf.Tensor:
    # Convert the tensor of documents into a single corpus string.
    corpus = ''
    for doc_index in range(self.args.number_of_periods):
        corpus += text[doc_index].numpy().decode('utf-8', 'ignore')

    # Get the word counts.
    vectorizer = CountVectorizer()
    word_counts = vectorizer.fit_transform([corpus])

    # Apply the log1p transformation.
    transformer = FunctionTransformer(np.log1p)
    log1p_features = transformer.transform(word_counts.toarray())[0]

    # Walk the vocabulary in the vectorizer's own order so each count stays
    # aligned with its word (indexing a filtered word list would break the
    # alignment); skip tokens containing digits and words not in the
    # dictionary.
    output = [0.] * len(self.dict)
    for vocab_index, word in enumerate(vectorizer.get_feature_names()):
        word = word.lower()
        if any(char.isdigit() for char in word) or word not in self.dict:
            continue
        output[self.dict.index(word)] = log1p_features[vocab_index]
    # Return a tensor, matching the annotated return type.
    return tf.constant(output)
class LogLGBM(LGBMRegressor):
    def __init__(self, target=None, **kwargs):
        super().__init__(**kwargs)
        if target == "Oil_norm":
            self.target_scaler = PowerTransformer(method='box-cox',
                                                  standardize=False)
        elif target in ('Gas_norm', 'Water_norm'):
            self.target_scaler = FunctionTransformer(func=np.log1p,
                                                     inverse_func=np.expm1)
        else:
            raise ValueError("Unsupported target: {}".format(target))

    def fit(self, X, Y, **kwargs):
        # Shift by +1 before the transform (and undo it in predict) so
        # box-cox always sees strictly positive values.
        self.target_scaler.fit(Y.values.reshape(-1, 1) + 1)
        y_train = pd.Series(
            self.target_scaler.transform(
                Y.values.reshape(-1, 1) + 1).reshape(-1,))
        super(LogLGBM, self).fit(X, y_train, **kwargs)
        return self

    def predict(self, X):
        preds = super(LogLGBM, self).predict(X).reshape(-1, 1)
        preds = self.target_scaler.inverse_transform(preds) - 1
        return preds[:, 0]
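# A minimal usage sketch for LogLGBM on synthetic data (the feature and
# target values below are made up for illustration):
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
X = pd.DataFrame({'f0': rng.rand(200), 'f1': rng.rand(200)})
y = pd.Series(rng.rand(200) * 1000.0)

model = LogLGBM(target='Gas_norm', n_estimators=50)
model.fit(X, y)
preds = model.predict(X)  # predictions come back on the original scale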
class DataTransfomer:
    """A class to transform data based on a user-defined function to get
    predicted outcomes. This class calls scikit-learn's FunctionTransformer
    internally
    (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.FunctionTransformer.html)."""

    def __init__(self, func=None, kw_args=None):
        self.func = func
        self.kw_args = kw_args

    def feed_data_params(self, data_interface):
        if self.kw_args is not None:
            self.kw_args['data_interface'] = data_interface
        else:
            self.kw_args = {'data_interface': data_interface}

    def initialize_transform_func(self):
        if self.func == 'ohe-min-max':
            self.data_transformer = FunctionTransformer(
                func=ohe_min_max_transformation, kw_args=self.kw_args,
                validate=False)
        elif self.func is None:
            # Identity transformation.
            self.data_transformer = FunctionTransformer(
                func=self.func, kw_args=None, validate=False)
        else:
            # Add more ready-to-use transformers (such as label encoding)
            # in further elif branches.
            self.data_transformer = FunctionTransformer(
                func=self.func, kw_args=self.kw_args, validate=False)

    def transform(self, data):
        # Should return a numpy array.
        return self.data_transformer.transform(data)

    def inverse_transform(self, data):
        # Should return a numpy array.
        return self.data_transformer.inverse_transform(data)
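# A minimal sketch of driving DataTransfomer with a custom function (the
# function below is hypothetical, not part of the original code). Note how
# feed_data_params injects data_interface as a keyword argument that
# FunctionTransformer forwards to the function.
import numpy as np

def square(data, data_interface=None):
    return np.asarray(data) ** 2

transformer = DataTransfomer(func=square)
transformer.feed_data_params(None)
transformer.initialize_transform_func()
print(transformer.transform([1, 2, 3]))  # -> [1 4 9]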
class FunctionTransformerPrim(primitive):
    def __init__(self, random_state=0):
        super(FunctionTransformerPrim, self).__init__(
            name='FunctionTransformer')
        self.id = 11
        self.hyperparams = []
        self.type = 'feature preprocess'
        self.description = ("Constructs a transformer from an arbitrary "
                            "callable. A FunctionTransformer forwards its X "
                            "(and optionally y) arguments to a user-defined "
                            "function or function object and returns the "
                            "result of this function. This is useful for "
                            "stateless transformations such as taking the "
                            "log of frequencies, doing custom scaling, etc.")
        self.hyperparams_run = {'default': True}
        self.scaler = FunctionTransformer()
        self.accept_type = 'c_t'

    def can_accept(self, data):
        return self.can_accept_c(data)

    def is_needed(self, data):
        return True

    def fit(self, data):
        data = handle_data(data)
        self.scaler.fit(data['X'])

    def produce(self, data):
        output = handle_data(data)
        cols = ["{}_qntl".format(x) for x in list(output['X'].columns)]
        output['X'] = pd.DataFrame(self.scaler.transform(output['X']),
                                   columns=cols)
        return {0: output}
class TensorScaler(TransformerMixin):
    """Scaling for 3D tensors.

    Assumes the size is (..., length, input_channels), reshapes to
    (..., input_channels), performs the scaling operation and then
    reshapes back.

    Arguments:
        method (str): Scaling method, one of ('stdsc', 'ma', 'mms').
        scaling_function (transformer): Specification of an sklearn
            transformer that performs a scaling operation. Only one of
            this or method can be specified.
    """

    def __init__(self, method="stdsc", scaling_function=None):
        self.scaling = method
        if all([method is None, scaling_function is None]):
            self.scaler = FunctionTransformer(func=None)
        elif isinstance(method, str):
            # Check the lookup before instantiating, otherwise a bad method
            # name raises a TypeError instead of this message.
            scaler_cls = SCALERS.get(method)
            assert scaler_cls is not None, (
                "Scalings allowed are {}, received {}.".format(
                    SCALERS.keys(), method))
            self.scaler = scaler_cls()
        else:
            self.scaler = scaling_function

    @apply_fit_to_channels
    def fit(self, data, labels=None):
        self.scaler.fit(data)
        return self

    @apply_transform_to_channels
    def transform(self, data):
        return torch.Tensor(self.scaler.transform(data))
def logarithmic_regression(input_data, cement, water, coarse_aggr, fine_aggr,
                           days):
    variables = input_data.iloc[:, :-1]
    results = input_data.iloc[:, -1]
    # Reshape the values so that variables and results have the same shape.
    n = results.shape[0]
    results = results.values.reshape(n, 1)

    # Transform the x data with a logarithmic function.
    log_transformer = FunctionTransformer(np.log, validate=True)
    log_variables = log_transformer.fit_transform(variables)

    # Make a linear model and fit the logarithmic data to it.
    regression = linear_model.LinearRegression()
    regression.fit(log_variables, results)

    input_values = [cement, water, coarse_aggr, fine_aggr, days]
    # Transform the input data for prediction with the same log function.
    input_values = log_transformer.transform([input_values])

    # Predict the outcome based on input_values.
    predicted_strength = regression.predict(input_values)
    predicted_strength = round(predicted_strength[0, 0], 2)

    return "Logarithmic prediction: " + str(predicted_strength)
class DistanceTransformer:
    """Transforms the raw distances to the appropriate modeling form."""

    def __init__(self, pos_features, pipeline_obj_path):
        """
        Args:
            pos_features: list of positional features to use
            pipeline_obj_path: path to the serialized pipeline object
        """
        self.pos_features = pos_features
        self.pipeline_obj_path = pipeline_obj_path

        # Deserialize the pickle file.
        with open(self.pipeline_obj_path, "rb") as f:
            pipeline_obj = pickle.load(f)
        self.POS_FEATURES = pipeline_obj[0]
        self.minmax_scaler = pipeline_obj[1]
        self.imp = pipeline_obj[2]
        self.funct_transform = FunctionTransformer(
            func=sign_log_func, inverse_func=sign_log_func_inverse)
        # For simplicity, assume all current pos_features are the same as
        # before.
        assert self.POS_FEATURES == self.pos_features

    def transform(self, x):
        # Impute missing values and rescale the distances.
        xnew = self.minmax_scaler.transform(
            self.funct_transform.transform(self.imp.transform(x)))
        # Convert distances to spline bases.
        dist = {"dist_" + k: encodeSplines(xnew[:, i, np.newaxis],
                                           start=0, end=1, warn=False)
                for i, k in enumerate(self.POS_FEATURES)}
        return dist
def log_trans(self):
    self._data_init()
    transformer = FunctionTransformer(np.log1p)
    X = self.data.values
    y = self.label.values
    X = self.data_array = transformer.transform(X)
    sio.savemat("clean_data/" + self.dataset, {'X': X, 'y': y})
def test_kw_arg():
    X = np.linspace(0, 1, num=10).reshape((5, 2))
    F = FunctionTransformer(np.around, kw_args=dict(decimals=3))

    # Test that rounding is correct.
    assert_array_equal(F.transform(X), np.around(X, decimals=3))
def transform_to_depth_pct(self, data):
    """
    transform_to_depth_pct takes in a dataframe like diamonds and returns
    an np.ndarray consisting of the approximate depth percentage of each
    diamond.

    :Example:
    >>> diamonds = sns.load_dataset('diamonds').drop(columns='depth')
    >>> out = TransformDiamonds(diamonds)
    >>> transformed = out.transform_to_depth_pct(diamonds)
    >>> len(transformed.shape) == 1
    True
    >>> np.isclose(transformed[0], 61.286, atol=0.0001)
    True
    """
    # Custom function to calculate the depth percentage.
    def depth_pct(arrs):
        depth_pct = []
        for arr in arrs:
            x, y, z = arr[0], arr[1], arr[2]
            depth_pct.append(100 * z / ((x + y) / 2))
        return np.array(depth_pct)

    trans = FunctionTransformer(depth_pct, validate=True)
    return trans.transform(data[['x', 'y', 'z']].values)
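# The row-by-row loop above can also be written as a single vectorized
# expression; a sketch (not from the original source), assuming arrs has
# shape (n, 3):
def depth_pct_vectorized(arrs):
    x, y, z = arrs[:, 0], arrs[:, 1], arrs[:, 2]
    return 100 * z / ((x + y) / 2)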
def log_trans(data):
    """Apply a log(1 + x) transform.

    :param data: array-like of non-negative values
    :return: the log1p-transformed data
    """
    transformer = FunctionTransformer(np.log1p)
    return transformer.transform(data)
def q4():
    # Return the result of question 4 here.
    transformer = FunctionTransformer(np.log1p)
    df = get_sample(athletes, 'weight', n=3000)
    df = transformer.transform(df)
    (k2, pvalue) = sct.normaltest(df)
    return bool(pvalue >= 0.05)
def Scaler(X_train):
    # Log-transform, then min-max scale the training matrix.
    transformer = FunctionTransformer(np.log1p, validate=True)
    X_train = transformer.transform(X_train)
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaler.fit(X_train)
    return scaler.transform(X_train)
def test_inverse_transform():
    X = np.array([1, 4, 9, 16]).reshape((2, 2))

    # Test that inverse_transform works correctly.
    F = FunctionTransformer(func=np.sqrt, inverse_func=np.around,
                            inv_kw_args=dict(decimals=3))
    testing.assert_array_equal(F.inverse_transform(F.transform(X)),
                               np.around(np.sqrt(X), decimals=3))
def load_tensor_data(fileloc):
    """
    Helper function to load the actors data, filtered by the criteria of
    $1 million minimum revenue and actors appearing in at least 20 movies.
    Returns the actor matrix and log-normalized revenue as torch tensors.
    """
    data_actors = pd.read_csv(fileloc, index_col=0)
    X = data_actors.iloc[:, 2:]
    X_data = torch.Tensor(X.to_numpy(dtype='float32'))

    transformer = FunctionTransformer(np.log1p, validate=True)
    data_actors["log_revenue"] = transformer.transform(
        data_actors["revenue"].values.reshape(-1, 1))
    Y_data = torch.Tensor(
        data_actors["log_revenue"].to_numpy().reshape(X.shape[0], 1))

    cols_keep = ['Judi Dench', 'Cobie Smulders']
    cols_20 = ['title_x', 'revenue', 'log_revenue']
    for col in data_actors.columns[2:-1]:
        if col in cols_keep:
            continue
        elif np.sum(data_actors[col]) >= 20:
            cols_20.append(col)

    data_million = data_actors[cols_20 + cols_keep]
    must_keep = data_million[(data_million["Judi Dench"] == 1)
                             | (data_million["Cobie Smulders"] == 1)]
    data_million = data_million[data_million["revenue"] > 1000000]

    # pd.concat replaces the now-removed DataFrame.append.
    X_all = pd.concat(
        [data_million[data_million.columns.difference(
            ['revenue', 'log_revenue'])],
         must_keep[must_keep.columns.difference(['revenue', 'log_revenue'])]],
        ignore_index=True)
    y_all = pd.concat([data_million['revenue'], must_keep['revenue']])

    x_train = X_all
    x_train_tensors = torch.tensor(
        x_train.drop("title_x", axis=1).to_numpy(dtype='float32'))
    y_train_tensors = torch.tensor(y_all.to_numpy(dtype='float32'))

    # Move the last column to the front.
    cols = list(x_train.columns)
    cols = [cols[-1]] + cols[:-1]
    x_train = x_train[cols]

    return x_train_tensors, y_train_tensors, x_train.columns, x_train
def exponential_transformation(data):
    # np.expm1 is the inverse of np.log1p; np.exp1p does not exist.
    transformer = FunctionTransformer(np.expm1, validate=True)
    for column in data.columns:
        if column not in config.CATEGORICALS:
            # validate=True expects a 2D array, so reshape the column.
            data[column] = transformer.transform(
                data[column].values.reshape(-1, 1))
    return data
def log_transformation(data):
    transformer = FunctionTransformer(np.log1p, validate=True)
    for column in data.columns:
        temp = data[column].values.reshape(-1, 1)
        # log1p already computes log(1 + x); adding 1 again here would
        # silently compute log(x + 2).
        data[column] = transformer.transform(temp)
    return data
def combine_attr_adder(housing):
    attr_adder = FunctionTransformer(add_extra_features, validate=False,
                                     kw_args={"add_bedrooms_per_room": False})
    housing_extra_attribs = attr_adder.transform(housing.values)
    housing_extra_attribs = pd.DataFrame(
        housing_extra_attribs,
        columns=list(housing.columns) + ["rooms_per_household",
                                         "population_per_household"])
    return housing_extra_attribs
def test_function_transformer(self):
    x = numpy.array([[6.1, -5], [3.5, -7.8]], dtype=numpy.float32)
    tr = FunctionTransformer(custom_fct)
    tr.fit(x)
    y_exp = tr.transform(x)
    self.assertEqualArray(
        numpy.array([[6.1, 0.], [3.5, 0.]], dtype=numpy.float32), y_exp)

    onnx_model = to_onnx(tr, x)
    oinf = OnnxInference(onnx_model)
    y_onx = oinf.run({'X': x})
    self.assertEqualArray(y_exp, y_onx['variable'])
def test_function_transformer_pickle(self):
    x = numpy.array([[6.1, -5], [3.5, -7.8]], dtype=numpy.float32)
    tr = FunctionTransformer(custom_fct)
    tr.fit(x)
    y_exp = tr.transform(x)

    # Round-trip the transformer through pickle.
    st = BytesIO()
    pickle.dump(tr, st)
    cp = BytesIO(st.getvalue())
    tr2 = pickle.load(cp)
    y_exp2 = tr2.transform(x)
    self.assertEqualArray(y_exp, y_exp2)
def test_function_transformer_fft_abs(self):
    for rt, fct in [('py', custom_fft_abs), ('ort', custom_fft_abs_ort)]:
        with self.subTest(runtime=rt):
            x = numpy.array([[6.1, -5], [3.5, -7.8]], dtype=numpy.float32)
            tr = FunctionTransformer(fct)
            tr.fit(x)
            y_exp = tr.transform(x)

            onnx_model = to_onnx(tr, x)
            oinf = OnnxInference(onnx_model)
            y_onx = oinf.run({'X': x})
            self.assertEqualArray(y_exp, y_onx['variable'], decimal=5)
class FunctionExtractor(TransformerMixin):
    def __init__(self, func, result_column, source_column=None,
                 validate=False):
        self.func = func
        self.source_column = source_column
        self.result_column = result_column
        self.validate = validate
        self.extractor = FunctionTransformer(self.func,
                                             validate=self.validate)

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if self.source_column is not None:
            X[self.result_column] = self.extractor.transform(
                X[self.source_column])
        else:
            X[self.result_column] = self.extractor.transform(X)
        return X
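# A minimal usage sketch for FunctionExtractor (the column names below are
# hypothetical): derive a log-price column from a price column.
import numpy as np
import pandas as pd

frame = pd.DataFrame({'price': [10.0, 100.0]})
extractor = FunctionExtractor(np.log1p, result_column='log_price',
                              source_column='price')
frame = extractor.fit_transform(frame)  # adds the 'log_price' column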
def prepare_data(df):
    import numpy as np
    import pandas as pd
    from sklearn.preprocessing import FunctionTransformer

    sc = FunctionTransformer(np.log1p)
    df[['goal']] = sc.transform(df[['goal']])

    # One-hot encode all categorical columns in a single call.
    df = pd.get_dummies(df, columns=['country', 'category',
                                     'deadline_weekday',
                                     'created_at_weekday',
                                     'launched_at_weekday'])
    return df
def scale_data(df, p, train=True, save=True):
    if p.log_scale:
        # Avoid log2(0) by clamping zero pend times to 1.
        df.loc[df["last_pend_time"] == 0, "last_pend_time"] = 1
        if train:
            log_scaler = FunctionTransformer(np.log2)
            df.loc[:, ["last_pend_time"]] = log_scaler.fit_transform(
                df[["last_pend_time"]])
            if save:
                joblib.dump(log_scaler, "log_scaler.save")
        else:
            log_scaler = joblib.load("log_scaler.save")
            df.loc[:, ["last_pend_time"]] = log_scaler.transform(
                df[["last_pend_time"]])

    scale_cols = ["last_pend_time"]
    if p.use_using_cores:
        scale_cols.append("using_cores")
    if p.use_spending_run_time:
        scale_cols.append("spending_run_time")
    if p.use_pending_jobs:
        scale_cols.append("pending_jobs")
    if p.use_last_pend_time_submit:
        scale_cols.append("last_pend_time_submit")
    if p.use_submit_time:
        scale_cols.append("sin_submit_time")
        scale_cols.append("cos_submit_time")
    if p.use_day_of_week:
        scale_cols.append("sin_day_of_week")
        scale_cols.append("cos_day_of_week")

    if train:
        min_max_scaler = MinMaxScaler(feature_range=(0, 1))
        df.loc[:, scale_cols] = min_max_scaler.fit_transform(df[scale_cols])
        if save:
            joblib.dump(min_max_scaler, "min_max_scaler.save")
    else:
        min_max_scaler = joblib.load("min_max_scaler.save")
        df.loc[:, scale_cols] = min_max_scaler.transform(df[scale_cols])

    if p.standard_scale:
        if train:
            standard_scaler = StandardScaler()
            df.loc[:, scale_cols] = standard_scaler.fit_transform(
                df[scale_cols])
            if save:
                joblib.dump(standard_scaler, "standard_scaler.save")
        else:
            standard_scaler = joblib.load("standard_scaler.save")
            df.loc[:, scale_cols] = standard_scaler.transform(df[scale_cols])

    return df
def test_functiontransformer_vs_sklearn():
    # Compare msmbuilder.preprocessing.FunctionTransformer
    # with sklearn.preprocessing.FunctionTransformer.
    functiontransformerr = FunctionTransformerR()
    functiontransformerr.fit(np.concatenate(trajs))

    functiontransformer = FunctionTransformer()
    functiontransformer.fit(trajs)

    y_ref1 = functiontransformerr.transform(trajs[0])
    y1 = functiontransformer.transform(trajs)[0]

    np.testing.assert_array_almost_equal(y_ref1, y1)
class DFFunctionTransformer(TransformerMixin):
    # FunctionTransformer, but for pandas DataFrames.
    def __init__(self, *args, **kwargs):
        self.ft = FunctionTransformer(*args, **kwargs)

    def fit(self, X, y=None):
        # Stateless transformer.
        return self

    def transform(self, X):
        Xt = self.ft.transform(X)
        Xt = pd.DataFrame(Xt, index=X.index, columns=X.columns)
        return Xt
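# A minimal usage sketch for DFFunctionTransformer (the frame below is
# illustrative): wrapping np.log1p so the DataFrame's index and column
# labels survive the transform.
import numpy as np
import pandas as pd

df = pd.DataFrame({'a': [1.0, 2.0], 'b': [3.0, 4.0]}, index=['r1', 'r2'])
df_log = DFFunctionTransformer(np.log1p).fit_transform(df)
assert list(df_log.columns) == ['a', 'b']  # labels preserved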
# Dataframe slicing: selectionlist gets passed the parameter list from the
# JSON object.
selectionlist = []
selectionlist.extend(args.list)

# Read the data.
df1 = pd.read_table('penalties.csv', sep=';', header=0)

# All headers.
colnames = list(df1.columns.values)

# Slice the data (.loc replaces the deprecated .ix accessor).
X = df1.loc[:, selectionlist]

# Square-root transform the heavily skewed data.
transformer = FunctionTransformer(np.sqrt)
Xtran = transformer.transform(X)
# Rebuild the frame with the original column headers.
X = pd.DataFrame(Xtran, columns=selectionlist)

# Remaining columns.
colnamesrest = [x for x in colnames if x not in selectionlist]
Rest = df1.loc[:, colnamesrest]

# Drop the multiplier column.
del Rest['multiplier']

# Plot a 3x3 scatter-plot matrix (pandas.plotting replaces the old
# pandas.tools.plotting location).
from pandas.plotting import scatter_matrix
scatter_matrix(X, alpha=0.2, figsize=(3, 3))
plt.show()
import numpy as np
import pandas as pd

from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import PReLU
from keras.utils import np_utils, generic_utils
from sklearn.preprocessing import FunctionTransformer

# Read data (.iloc replaces the deprecated .ix accessor).
train_tour1 = pd.read_csv('numerai_training_data.csv')
feature = pd.DataFrame(train_tour1.iloc[:, 0:21])
target = pd.DataFrame(train_tour1.target)

# Log-transform the features.
transformer = FunctionTransformer(np.log1p)
feature_log = transformer.transform(feature)

# Combine the raw and log features.
feature_log = pd.DataFrame(feature_log)
feature_all = pd.concat([feature, feature_log], axis=1)

# Separate target and features.
feature_all = np.asarray(feature_all)
target = np.asarray(target)

# Convert the list of labels to a binary class matrix.
target = np_utils.to_categorical(target)

# Pre-processing: divide by the max and subtract the mean.
scale = np.max(feature_all)
feature_all /= scale