def fit(self, y: np.ndarray) -> "ClassifierLabelEncoder":
    """Fit the estimator to the target y.

    For all targets, this transforms classes into ordinal numbers.
    If the loss function is categorical_crossentropy, the target
    will be one-hot encoded.

    Parameters
    ----------
    y : np.ndarray
        The target data to be transformed.

    Returns
    -------
    ClassifierLabelEncoder
        A reference to the current instance of ClassifierLabelEncoder.
    """
    target_type = self._type_of_target(y)
    keras_dtype = np.dtype(tf.keras.backend.floatx())
    self._y_shape = y.shape
    encoders = {
        "binary": make_pipeline(
            TargetReshaper(),
            OrdinalEncoder(dtype=keras_dtype, categories=self.categories),
        ),
        "multiclass": make_pipeline(
            TargetReshaper(),
            OrdinalEncoder(dtype=keras_dtype, categories=self.categories),
        ),
        "multiclass-multioutput": FunctionTransformer(),
        "multilabel-indicator": FunctionTransformer(),
    }
    if is_categorical_crossentropy(self.loss):
        encoders["multiclass"] = make_pipeline(
            TargetReshaper(),
            OneHotEncoder(sparse=False, dtype=keras_dtype, categories=self.categories),
        )
    if target_type not in encoders:
        raise ValueError(
            f"Unknown label type: {target_type}."
            "\n\nTo implement support, subclass KerasClassifier and override"
            " ``target_encoder`` with a transformer that supports this"
            " label type."
            "\n\nFor information on sklearn target types, see:"
            " * https://scikit-learn.org/stable/modules/generated/sklearn.utils.multiclass.type_of_target.html"
            " * https://scikit-learn.org/stable/modules/multiclass.html"
            "\n\nFor information on the SciKeras data transformation interface, see:"
            " * https://scikeras.readthedocs.io/en/latest/advanced.html#data-transformers"
        )
    self._final_encoder = encoders[target_type].fit(y)

    if (
        target_type == "multilabel-indicator"
        and y.min() == 0
        and (y.sum(axis=1) == 1).all()
    ):
        target_type = "multiclass-onehot"

    self.n_outputs_ = 1
    self.n_outputs_expected_ = 1
    self._y_dtype = y.dtype
    self._target_type = target_type

    if target_type in ("binary", "multiclass"):
        self.classes_ = self._final_encoder[1].categories_[0]
        self.n_classes_ = self.classes_.size
    elif target_type in ("multiclass-onehot", "multilabel-indicator"):
        self.classes_ = np.arange(0, y.shape[1])
        self.n_classes_ = y.shape[1]
    elif target_type == "multiclass-multioutput":
        self.classes_ = None
        self.n_classes_ = None

    return self
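# A minimal sketch (not part of the class above) of what the "multiclass"
# branch does with plain sklearn: reshape 1D labels to a column, then
# ordinal-encode them to a float dtype. TargetReshaper is SciKeras-internal,
# so a numpy reshape stands in for it here; the labels are made-up.
import numpy as np
from sklearn.preprocessing import OrdinalEncoder

y_demo = np.array(["cat", "dog", "cat", "bird"])
enc = OrdinalEncoder(dtype=np.float32)
y_ord = enc.fit_transform(y_demo.reshape(-1, 1))
print(enc.categories_[0])  # ['bird' 'cat' 'dog'] -> what becomes classes_
print(y_ord.ravel())       # [1. 2. 1. 0.]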
def __init__(self, cast_type=None):
    self.transformer_ = FunctionTransformer(
        feature_cast, kw_args={"cast_type": cast_type}, validate=False)
def __init__(self):
    self.transformer_ = FunctionTransformer(to_dense, validate=False)
import numpy as np
import pandas as pd
from sklearn.decomposition import FastICA
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import Imputer
from tpot.builtins import StackingEstimator
from xgboost import XGBClassifier
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=42)

imputer = Imputer(strategy="median")
imputer.fit(training_features)
training_features = imputer.transform(training_features)
testing_features = imputer.transform(testing_features)

# Score on the training set was: 0.9182509505703423
exported_pipeline = make_pipeline(
    make_union(FastICA(tol=0.75), FunctionTransformer(copy)),
    XGBClassifier(learning_rate=0.01, max_depth=4, min_child_weight=7,
                  n_estimators=100, nthread=1, subsample=0.6500000000000001))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
def calc_returns(closes):
    log_prices = FunctionTransformer(func=np.log).fit_transform(closes)
    returns = pd.DataFrame(log_prices).diff()
    returns.columns = closes.columns
    returns = returns.drop(returns.index[0])
    return returns
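# Hedged usage sketch for calc_returns above; the close prices are made-up.
import numpy as np
import pandas as pd

closes_demo = pd.DataFrame({'AAA': [100.0, 101.0, 99.0],
                            'BBB': [50.0, 50.5, 51.0]})
print(calc_returns(closes_demo))  # one log-return column per ticker, first row dropped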
def train():
    # 1. read jsons into a list
    data = load_data()

    # 2. clean data
    # can not do this inside the pipeline as doing so
    # might get rid of the input data during inference
    # if during the inference only part of data is received
    # this is to filter out some really bad responses, e.g.
    # 'instagram': {'num_users': 1, 'users': [{'username': '******'}]}
    # iterate over a copy: removing from the list being iterated skips items
    for i in list(data):
        if i['instagram']['num_users'] > 0:
            if i['instagram']['users'][0].get('followers_count', -1) < 0:
                data.remove(i)

    # split the pipeline into two: preprocess and model
    # can't pickle lambdas, so use named functions
    def json_to_df(x):
        return pd.DataFrame(x)

    def fill_na(x):
        return x.fillna(-1)

    preprocess_pipe = Pipeline([
        ('restructure_jsons', FunctionTransformer(restructure_data)),
        ('jsons_to_df', FunctionTransformer(json_to_df)),
        # not using imputer as it casts to array
        # fill with -1 to separate missing case from 0
        ('fill_na', FunctionTransformer(fill_na)),
    ])

    def cast_to_float32(x):
        return x.values.astype('float32')

    def vae_output_format(x):
        return (np.array(x)
                .reshape(x.shape[0], -1)
                .mean(axis=1)
                .__sub__(1)
                .clip(-1, 1)
                .round(2))

    # float64 is due to the fact that float32 is not json serialisable
    # https://github.com/tensorflow/tensorboard/issues/3057
    def final_format(x):
        return np.atleast_1d(x).astype('float64').round(2)

    model_pipe = Pipeline([
        ('ECDF', CustomECDF()),
        # ('prep_for_VAE', FunctionTransformer(cast_to_float32)),
        # ('VAE', VAE_numpy()),
        # ('vae_output_format', FunctionTransformer(vae_output_format))
        ('CustomScoreCombination', CustomScoreCombination()),
        ('final_format', FunctionTransformer(final_format))
    ])

    # output
    train_data = preprocess_pipe.transform(data)
    model_pipe.fit(train_data)
    # scores = model_pipe.transform(train_data)
    # print(pd.concat([train_data, scores], axis=1))
    # temp = pd.concat([train_data, pd.Series(scores.ravel())], axis=1)
    # temp.sort_values(0).round(2)

    # save
    # make sure to change the settings as follows
    # otherwise importing the saved files will be hard
    # https://github.com/uqfoundation/dill/issues/126
    dill.settings['recurse'] = True

    # to be able to save and load the models with no error
    # have to transform at least once before saving
    # otherwise it throws the following error:
    # _function_transformer.py - KeyError: '__builtins__'
    model_pipe.transform(train_data)

    print('Saving')
    preprocess_pipe_path = 'web_score/scorers/preprocess_pipe.pkl'
    model_pipe_path = 'web_score/scorers/model_pipe.pkl'
    with open(preprocess_pipe_path, 'wb') as i, open(model_pipe_path, 'wb') as j:
        dill.dump(preprocess_pipe, i)
        dill.dump(model_pipe, j)
def column_transformer(name):
    return FunctionTransformer(
        partial(pd.DataFrame.__getitem__, key=name), validate=False)
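# Hedged usage sketch for column_transformer above; the DataFrame and the
# downstream TfidfVectorizer step are hypothetical.
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

df_demo = pd.DataFrame({'text': ['red shoes', 'blue shoes'], 'n': [1, 2]})
pipe_demo = make_pipeline(column_transformer('text'), TfidfVectorizer())
X_demo = pipe_demo.fit_transform(df_demo)  # selects df_demo['text'], then vectorizes it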
from sklearn.preprocessing import FunctionTransformer

# Get the dummy encoding of the labels
dummy_labels = pd.get_dummies(df[LABELS])

# Get the columns that are features in the original df
NON_LABELS = [c for c in df.columns if c not in LABELS]

# Split into training and test sets
X_train, X_test, y_train, y_test = multilabel_train_test_split(df[NON_LABELS],
                                                               dummy_labels,
                                                               0.2,
                                                               seed=123)

# Preprocess the text data: get_text_data
get_text_data = FunctionTransformer(combine_text_columns, validate=False)

# Preprocess the numeric data: get_numeric_data
get_numeric_data = FunctionTransformer(lambda x: x[NUMERIC_COLUMNS], validate=False)

# Complete the pipeline: pl
pl = Pipeline([
    ('union', FeatureUnion(
        transformer_list=[
            ('numeric_features', Pipeline([
                ('selector', get_numeric_data),
                ('imputer', Imputer())
            ])),
            ('text_features', Pipeline([
                ('selector', get_text_data),
                # the snippet was truncated here; a CountVectorizer step and a
                # final classifier are assumed to close the pipeline
                ('vectorizer', CountVectorizer())
            ]))
        ]
    )),
    ('clf', OneVsRestClassifier(LogisticRegression()))
])
import numpy as np
import pandas as pd
import streamlit as st
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import FunctionTransformer

# data directory
DATA_hmda_acs = 'data/hmda_acs_merged.csv'
DATA_zip_tract = 'data/zip_tract_122017.xlsx'
DATA_shp = 'data/2019/tl_2019_53_tract.shp'
DATA_zipcodes = 'data/zipcodes_king.csv'
MODEL_lr_nh = 'data/lr_model_nh.sav'
MODEL_lr_hf = 'data/lr_model_hf.sav'

# dictionary of models
model_dict = {'Original': MODEL_lr_nh,
              'Without Population Bias': MODEL_lr_hf}

# transform the feature vectors
transformer = FunctionTransformer(np.log1p, validate=True)
scaler = MinMaxScaler(feature_range=(0.2, 0.8))

# set the title of web app
st.title('intelliRefinder')
st.markdown(
    '''Predict optimal locations for mortgage refinance business
    opportunities using machine learning algorithms on US
    OpenStreetMap (OSM) data. ''')

# load zip codes of king county WA
zipcodes = pd.read_csv(DATA_zipcodes)['zip']

#algorithms = ('Logistic Regression', 'Random Forest')
interventions = ('Original', 'Without Population Bias')

# setting up the sidebar and loading the data
Jc.mobileReady = Jc.mobileReady.astype(int)
Jc.personalized = Jc.personalized.astype(int)

# Preparing data for modelling
X = Jc.loc[:, ['hasCreative', 'mobileReady', 'percentOfList',
               'personalized', 'trans']]
y = Jc.readRatePercent

# preparing data for pipeline
from sklearn.preprocessing import FunctionTransformer

# trans - text data
getTrans = FunctionTransformer(lambda x: x['trans'], validate=False)

# Numerics
getNums = FunctionTransformer(
    lambda x: x[['hasCreative', 'mobileReady', 'percentOfList', 'personalized']],
    validate=False)

from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

union = FeatureUnion(
    transformer_list=[
        ('numerics', Pipeline([('selector', getNums)])),
        ('text', Pipeline([('selector', getTrans),
                           # truncated in the original; a TfidfVectorizer step
                           # is assumed here since it is imported above
                           ('vectorizer', TfidfVectorizer())]))])
def __init__(self):
    # make a transformer which will load the time series and compute the
    # connectome matrix
    self.transformer_fmri = make_pipeline(
        FunctionTransformer(func=_load_fmri, validate=False),
        ConnectivityMeasure(kind='tangent', vectorize=False))
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, f_regression
from sklearn.linear_model import RidgeCV, SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import MaxAbsScaler, PolynomialFeatures
from sklearn.svm import LinearSVR
from tpot.builtins import StackingEstimator
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: -3.533205704365034
exported_pipeline = make_pipeline(
    make_union(
        FunctionTransformer(copy),
        make_union(FunctionTransformer(copy), FunctionTransformer(copy))),
    SelectPercentile(score_func=f_regression, percentile=89),
    PolynomialFeatures(degree=2, include_bias=False, interaction_only=False),
    StackingEstimator(estimator=SGDRegressor(alpha=0.0, eta0=0.01,
                                             fit_intercept=True, l1_ratio=1.0,
                                             learning_rate="constant",
                                             loss="squared_loss",
                                             penalty="elasticnet",
                                             power_t=50.0)),
    StackingEstimator(estimator=RidgeCV()),
    MaxAbsScaler(),
    MaxAbsScaler(),
    # the export was truncated mid-call; the remaining LinearSVR arguments and
    # the fit/predict lines are assumed to follow the standard TPOT template
    LinearSVR(C=0.5, dual=True, epsilon=1.0, loss="epsilon_insensitive",
              tol=1e-05))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
from operator import itemgetter
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import FunctionTransformer

def on_field(f: str, *vec) -> Pipeline:
    '''Select the field ``f`` from the input, then apply the given
    transformer steps to it.'''
    return make_pipeline(FunctionTransformer(itemgetter(f), validate=False), *vec)
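# Hedged usage sketch for on_field above, in the make_union style it is
# usually combined with (the field names are hypothetical):
from sklearn.pipeline import make_union
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = make_union(
    on_field('title', TfidfVectorizer(max_features=1000)),
    on_field('description', TfidfVectorizer(max_features=5000)),
)
# vectorizer.fit_transform(df) then expects a DataFrame with those columns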
import numpy as np
np.warnings.filterwarnings('ignore')  # for suppressing warnings from numpy
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import FunctionTransformer

# reading data file into a np array
data = np.loadtxt('winequality-red.csv', skiprows=1, delimiter=';')

# separating features from input data
x = np.concatenate((data[:, 0:10], data[:, 11].reshape(-1, 1)), axis=1)

# adding square root of each feature as a new feature as it is improving
# accuracy of model
funt = FunctionTransformer(np.sqrt)
x = np.concatenate((x, funt.fit_transform(x)), axis=1)

# separating result variable from input data
y = data[:, 10]

# splitting data into train and test samples and training linear regression model
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20)
linear_r = LinearRegression(normalize=True)
linear_r.fit(x_train, y_train)
cross_val_mean = cross_val_score(linear_r, x_train, y_train,
                                 scoring='neg_mean_squared_error', cv=5)

print('Predicted AC vs Actual AC')
for x, y in zip(linear_r.predict(x_test), y_test):
    # the loop body was truncated in the original; printing the pair is assumed
    print(x, y)
def on_field(self, f: str, *vec):
    return make_pipeline(FunctionTransformer(itemgetter(f), validate=False), *vec)
"""
Reshape the data into a 3D array to fit the RNN model.
"""
def shape_model_data(X, n_timesteps, n_features):
    return X.reshape((X.shape[0], n_timesteps, n_features))


# In[8]:

"""
Define preprocessing steps.
"""
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('reshape', FunctionTransformer(shape_model_data,
                                    kw_args=dict(n_timesteps=Tx,
                                                 n_features=N_FEATURES)))
])


# In[10]:

"""
Transform the feature data.
"""
model_X_train = pipeline.fit_transform(X_train)
# use transform (not fit_transform) so the scaler fitted on the training
# data is applied to the test data
model_X_test = pipeline.transform(X_test)


# In[11]:

from keras.layers import Input, LSTM, BatchNormalization, Dense
from keras import Model
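# Hedged sanity check for shape_model_data above; the sizes standing in for
# Tx and N_FEATURES are made-up.
import numpy as np

demo = np.arange(24, dtype=float).reshape(2, 12)  # 2 samples, 12 flat features
print(shape_model_data(demo, n_timesteps=4, n_features=3).shape)  # (2, 4, 3)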
from sklearn.preprocessing import LabelEncoder
import numpy as np

# the original fit call was truncated above this point; a fit on the same
# labels is assumed so that le.classes_ is defined
le = LabelEncoder()
le.fit(['b', 'b', 'a', 'c'])
le.classes_
le.fit_transform(['b', 'b', 'a', 'c'])
le.inverse_transform([0, 0, 1, 2, 2])

from sklearn.preprocessing import Binarizer
X = [[ 1., -1.,  2.],
     [ 2.,  0.,  0.],
     [ 0.,  1., -1.]]
binarizer = Binarizer()
binarizer.fit(X)
binarizer.transform(X)
binarizer = Binarizer(threshold=1.1)
binarizer.transform(X)

from sklearn.feature_extraction import DictVectorizer
v = DictVectorizer(sparse=False)
D = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]
X = v.fit_transform(D)
X
v.feature_names_
v.inverse_transform(X)
v.transform({'foo': 4, 'unseen_feature': 3})

from sklearn.preprocessing import FunctionTransformer
def all_b(x):
    return x[:, 1:]

x = np.arange(12).reshape(4, 3)
func = FunctionTransformer(all_b)
func.fit_transform(x)
    # (function signature truncated above this point)
    # Be sure that the horizontal grid coordinates of both the source and
    # target cubes have contiguous bounds.
    besure_cube_has_continuous_bounds(param_cube)
    besure_cube_has_continuous_bounds(target_cube)

    # Use the given scheme to regrid
    drv_cube = param_cube.regrid(target_cube, regrid_scheme)
    return drv_cube


topo_tgt = empty_3d_cube_tgt(
    surface_alt_tgt_data, 'surface_altitude', 'm')
topo_src = empty_3d_cube_src(
    surface_alt_src_data, 'surface_altitude', 'm')
lsm_tgt = empty_3d_cube_tgt(
    lsm_tgt_data, 'land_area_fraction', '1')
lsm_src = empty_3d_cube_src(
    lsm_src_data, 'land_area_fraction', '1')
t_scn_src = empty_3d_cube_src(
    t_scn_src_data, 'air_temperature', 'K')
dpt_scn_src = empty_3d_cube_src(
    dpt_scn_src_data, 'dew_point_temperature', 'K')
sfc_prs_src = empty_3d_cube_src(
    sfc_prs_src_data, 'air_pressure_at_sea_level', 'Pa')

# the original referenced an undefined ``t_scn``; ``t_scn_src`` is assumed
X = t_scn_src.data
transformer = FunctionTransformer(interpolate_by_scipy_linear)
y = transformer.transform(X)
    # (enclosing ``if method_reg == ...:`` branch truncated above)
    print(dst_target.shape)
    dst_target = raw_target  # restore raw_target
    plot_iris_projection(x_index=0, y_index=1)

    dst_data = Imputer().fit_transform(
        vstack((array([nan, nan, nan, nan]), raw_data[:149])))
    ax = plt.subplot(2, 4, 2 + 4)
    ax.set_title('Imputer()')
    plot_iris_projection(x_index=0, y_index=2)

    dst_data = PolynomialFeatures().fit_transform(raw_data)
    ax = plt.subplot(2, 4, 3 + 4)
    ax.set_title('PolynomialFeatures()')
    plot_iris_projection(x_index=0, y_index=3)

    dst_data = FunctionTransformer(log1p).fit_transform(raw_data)
    ax = plt.subplot(2, 4, 4 + 4)
    ax.set_title('FunctionTransformer()')
    plot_iris_projection(x_index=1, y_index=2)

elif method_reg == "feature-select":
    dst_data = StandardScaler().fit_transform(raw_data)

    # variance selection method
    # parameter threshold is the threshold of variance
    dst_data = VarianceThreshold(threshold=3).fit_transform(raw_data)
    ax = plt.subplot(2, 4, 1 + 4)
    ax.set_title('VarianceThreshold()')
    plot_iris_projection(x_index=0, y_index=0)
    print(dst_data.shape)

    # Chi-square test
    dst_data = SelectKBest(chi2, k=2).fit_transform(raw_data, raw_target)
# normalizing features after nmf
truncated = b.get_nmf(tfidf, r_nmf)
truncated = preprocessing.scale(truncated, with_mean=False)
km = a.k_means_cluster(truncated, k)
if print_result:
    result = a.get_result(km, labels)
    a.print_result(result)
colors = [all_colors[x] for x in km.labels_]  # Python 2 ``map(lambda(x): ...)`` replaced
first = pl.subplot(334)
first.set_title('normalize features using nmf')
pl.scatter(truncated[:, 0:1], truncated[:, 1:2], c=colors)

# using non-linear transformation
non_linear = b.get_nmf(tfidf, r_nmf)
non_linear = FunctionTransformer(np.log1p).transform(non_linear)
km = a.k_means_cluster(non_linear, k)
if print_result:
    result = a.get_result(km, labels)
    a.print_result(result)
colors = [all_colors[x] for x in km.labels_]
first = pl.subplot(335)
first.set_title('non-linear')
pl.scatter(truncated[:, 0:1], truncated[:, 1:2], c=colors)

# using normalize first and then non-linear
truncated = b.get_nmf(tfidf, r_lsi)
truncated = preprocessing.scale(truncated, with_mean=False)
b_3_first = FunctionTransformer(np.log1p).transform(truncated)
km = a.k_means_cluster(b_3_first, k)
if print_result:
    # truncated in the original; the same result/print pattern as above is assumed
    result = a.get_result(km, labels)
    a.print_result(result)
def col2dict():
    return FunctionTransformer(
        lambda x: pd.DataFrame(x).to_dict(orient='records'), validate=False)
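# Hedged usage sketch for col2dict above, paired with DictVectorizer as this
# pattern usually is (the DataFrame is made-up):
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction import DictVectorizer

df_demo = pd.DataFrame({'shipping': [0, 1], 'status': ['new', 'used']})
pipe_demo = make_pipeline(col2dict(), DictVectorizer())
X_demo = pipe_demo.fit_transform(df_demo)  # one-hot for strings, passthrough for numbers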
def get_text_length(x):
    # the def line was truncated in the original; the name is taken from the
    # pipeline below
    return np.array([math.sqrt(len(t)) for t in x]).reshape(-1, 1)

# *********Features Pipeline*******
pipeline = Pipeline([
    ('features_union', FeatureUnion([
        ('ngrams_feature', Pipeline([
            ('ngrams_vect', TfidfVectorizer(binary=False, ngram_range=(1, 2))),
        ])),
        ('length', Pipeline([
            ('count', FunctionTransformer(get_text_length, validate=False)),
        ]))
    ])),
    # transformer_weights= {'words_feature': 1, 'ngrams_feature': 1, }
    ('normalization', Normalizer(copy=False)),
    ('classifier', LinearSVC(penalty='l2'))
])

# *********Applying preprocessing*******
reviews = compile(reviews)
#reviews = normalization(reviews)
#x_train,x_val,y_train,y_val = train_test_split(compile(reviews), target, train_size = 0.75, random_state = 42)
# x_train = get_stemmed_text(x_train,'Porter')
# x_val = get_stemmed_text(x_val,'Porter')
def calc_log_prices(closes):
    log_prices = FunctionTransformer(func=np.log).fit_transform(closes)
    log_df = pd.DataFrame(log_prices)
    log_df.index = closes.index
    log_df.columns = closes.columns
    return log_df
import numpy as np
from sklearn.cluster import FeatureAgglomeration
from sklearn.ensemble import ExtraTreesClassifier, VotingClassifier
from sklearn.kernel_approximation import Nystroem
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64)
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1),
                     tpot_data.dtype.names.index('class'), axis=1)
training_features, testing_features, training_classes, testing_classes = \
    train_test_split(features, tpot_data['class'], random_state=42)

exported_pipeline = make_pipeline(
    Nystroem(gamma=10.0, kernel="polynomial", n_components=10),
    make_union(
        VotingClassifier([("est", KNeighborsClassifier(n_neighbors=4, weights="distance"))]),
        FunctionTransformer(lambda X: X)),
    make_union(
        VotingClassifier([("est", ExtraTreesClassifier(criterion="entropy", max_features=1.0, n_estimators=500))]),
        FunctionTransformer(lambda X: X)),
    FeatureAgglomeration(affinity="precomputed", linkage="average"),
    GaussianNB())

exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)
def test_function_transformer_frame():
    pd = pytest.importorskip('pandas')
    X_df = pd.DataFrame(np.random.randn(100, 10))
    transformer = FunctionTransformer(validate=False)
    X_df_trans = transformer.fit_transform(X_df)
    assert hasattr(X_df_trans, 'loc')
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=None)

# Average CV score on the training set was: 0.8129780700079303
exported_pipeline = make_pipeline(
    make_union(StackingEstimator(estimator=GaussianNB()), FunctionTransformer(copy)),
    ExtraTreesClassifier(bootstrap=False, criterion="gini", max_features=0.4,
                         min_samples_leaf=3, min_samples_split=2, n_estimators=100))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
def __init__(self, impute_val=None):
    self.transformer_ = FunctionTransformer(
        impute_null, kw_args={"impute_val": impute_val}, validate=False)
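# Hedged stand-alone sketch of the kw_args pattern above; fill_value is a
# hypothetical stand-in for impute_null, which is defined elsewhere.
import numpy as np
from sklearn.preprocessing import FunctionTransformer

def fill_value(X, impute_val=None):
    X = np.asarray(X, dtype=float)
    return np.where(np.isnan(X), impute_val, X)

ft = FunctionTransformer(fill_value, kw_args={'impute_val': -1}, validate=False)
print(ft.fit_transform([[1.0, np.nan], [np.nan, 4.0]]))  # NaNs become -1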
from sklearn.feature_extraction.text import TfidfVectorizer as Tfidf
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_pipeline, make_union, Pipeline
from sklearn.feature_extraction import DictVectorizer
from operator import itemgetter
import pandas as pd


class Vectorizer():
    def __init__(self):
        self.vectorizer = None

    def on_field(self, f: str, *vec):
        return make_pipeline(FunctionTransformer(itemgetter(f), validate=False), *vec)

    def to_records(self, df: pd.DataFrame):
        return df.to_dict(orient='records')

    def tfidf_vectorizer(self, title_feat=100000, description_feat=500000):
        self.vectorizer = make_union(
            self.on_field("title",
                          Tfidf(max_features=title_feat, token_pattern=r"\w+")),
            self.on_field("description",
                          Tfidf(max_features=description_feat, token_pattern=r"\w+",
                                ngram_range=(1, 2))),
            self.on_field(['shipping', 'status'],
                          FunctionTransformer(self.to_records, validate=False),
                          DictVectorizer())
        )
        return self.vectorizer
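# Hedged usage sketch for Vectorizer above; the rows are made-up, the column
# names follow the ones the class hard-codes.
import pandas as pd

df_demo = pd.DataFrame({
    'title': ['red shoes', 'blue hat'],
    'description': ['barely worn red shoes', 'warm blue hat'],
    'shipping': [1, 0],
    'status': ['used', 'new'],
})
vec = Vectorizer().tfidf_vectorizer()
X_demo = vec.fit_transform(df_demo)  # title tfidf | description tfidf | dict features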
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, VarianceThreshold, f_classif
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from tpot.builtins import StackingEstimator
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was: 0.8228571428571427
exported_pipeline = make_pipeline(
    make_union(VarianceThreshold(threshold=0.4), FunctionTransformer(copy)),
    StandardScaler(),
    SelectPercentile(score_func=f_classif, percentile=70),
    LinearSVC(C=0.001, dual=True, loss="hinge", penalty="l2", tol=1e-05))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
BIAS_MAX_DF = 0.60  # Max occurrence for words to be bias words

params = {
    'counts__binary': [True, False],
    'model__max_epochs': [5, 10, 15, 20, 30, 40, 50, 60, 70, 80, 100],
    'model__lr': [1, 0.1, 0.01, 0.001, 0.0001, 0.00001],
    'model__batch_size': [5, 10, 15, 20, 30, 40, 50, 60, 70, 80, 100],
    'model__module__n_hidden': [5, 10, 15, 20, 30, 40, 50, 60, 70, 80, 100],
    'model__callbacks__lr_sched__patience': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
}

pipeline = Pipeline([
    ('counts', TfidfVectorizer(max_features=MAX_VOCAB, binary=False)),
    ('dense', FunctionTransformer(lambda x: x.toarray(), validate=False,
                                  accept_sparse=True)),
    ('model', WeightedNeuralNet(
        module=MLP,
        device='cuda',
        callbacks=[
            ('epoch_score', callbacks.EpochScoring(scoring='f1',
                                                   lower_is_better=False,
                                                   name='valid_f1')),
            ('lr_sched', callbacks.LRScheduler(policy='ReduceLROnPlateau',
                                               monitor='valid_f1',
                                               patience=3)),
            # truncated in the original; the remaining EarlyStopping
            # arguments are assumed
            ('early_stop', callbacks.EarlyStopping(monitor='valid_f1',
                                                   lower_is_better=False))]))])
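# Side note (hedged): the lambda in the 'dense' step above cannot be pickled
# with the standard pickle module (see the "can't pickle lambdas" workaround
# in the dill-based snippet earlier). A named function such as this
# hypothetical to_dense_array is a drop-in replacement when the pipeline
# must be saved:
def to_dense_array(x):
    return x.toarray()
# ('dense', FunctionTransformer(to_dense_array, validate=False, accept_sparse=True))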