def _test_binarizer_converter(self, threshold):
    """Fit a sklearn Binarizer, convert it to ONNX-ML and then to ONNX, and
    return the predictions of both converted models for comparison."""
    warnings.filterwarnings("ignore")
    data = np.array([[1, 2, 3], [4, 3, 0], [0, 1, 4], [0, 5, 6]], dtype=np.float32)

    # Reference scikit-learn model.
    skl_model = Binarizer(threshold=threshold)
    skl_model.fit(data)

    # Convert to ONNX-ML, then to ONNX via the converter under test.
    onnx_ml_model = convert_sklearn(
        skl_model,
        initial_types=[("float_input", FloatTensorType_onnx(data.shape))],
    )
    onnx_model = convert(onnx_ml_model, "onnx", data)

    # Predictions of the ONNX-ML model through onnxruntime.
    session = ort.InferenceSession(onnx_ml_model.SerializeToString())
    output_names = [out.name for out in session.get_outputs()]
    feed = {session.get_inputs()[0].name: data}
    onnx_ml_pred = session.run(output_names, feed)[0]

    # Predictions of the converted ONNX model.
    onnx_pred = onnx_model.transform(data)
    return onnx_ml_pred, onnx_pred
def use_Binarizer():
    """Demonstrate sklearn's Binarizer on a small 3x3 matrix."""
    sample = [[1., -1, 2.], [2., 0., 0.], [0., 1., -1.]]
    # Default threshold 0.0: positive values map to 1, the rest to 0.
    scaler = Binarizer()
    scaler.fit(sample)  # not actually needed — Binarizer is stateless
    print(scaler.transform(sample))
    # With threshold=1.5 only values strictly greater than 1.5 become 1.
    scaler = Binarizer(threshold=1.5)
    print(scaler.transform(sample))
    # One-shot functional version of Binarizer.
    print(preprocessing.binarize(sample))
def test_onnx_binarizer_converter_raises_rt(self):
    """Corrupting the ONNX-ML node attribute name must make convert() raise
    RuntimeError."""
    warnings.filterwarnings("ignore")
    data = np.array([[1, 2, 3], [4, 3, 0], [0, 1, 4], [0, 5, 6]], dtype=np.float32)

    skl_model = Binarizer(threshold=0)
    skl_model.fit(data)

    # Build a valid ONNX-ML model, then blank out the first attribute name
    # so the converter can no longer identify the threshold attribute.
    onnx_ml_model = convert_sklearn(
        skl_model,
        initial_types=[("float_input", FloatTensorType_onnx(data.shape))],
    )
    onnx_ml_model.graph.node[0].attribute[0].name = "".encode()

    self.assertRaises(RuntimeError, convert, onnx_ml_model, "onnx", data)
def test_onnxrt_python_Binarizer(self):
    """OnnxInference must reproduce sklearn Binarizer.transform on iris."""
    iris = load_iris()
    features, target = iris.data, iris.target
    X_train, X_test, y_train, _ = train_test_split(features, target, random_state=11)

    clr = Binarizer()
    clr.fit(X_train, y_train)

    model_def = to_onnx(clr, X_train.astype(numpy.float32))
    oinf = OnnxInference(model_def)
    got = oinf.run({'X': X_test})

    # The converted graph exposes exactly one output named 'variable'.
    self.assertEqual(sorted(got), ['variable'])
    expected = clr.transform(X_test)
    self.assertEqualArray(expected, got['variable'], decimal=6)
def test_binarizer_vs_sklearn():
    """msmbuilder.preprocessing.Binarizer must match sklearn.preprocessing.Binarizer."""
    # Reference: sklearn fitted on all trajectories concatenated.
    reference = BinarizerR()
    reference.fit(np.concatenate(trajs))
    # Under test: msmbuilder fitted on the list of trajectories.
    estimator = Binarizer()
    estimator.fit(trajs)

    expected = reference.transform(trajs[0])
    actual = estimator.transform(trajs)[0]
    np.testing.assert_array_almost_equal(expected, actual)
class BinarizerImpl:
    """Thin adapter that forwards fit/transform to the wrapped Op estimator."""

    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        # Forward y to the wrapped model only when one was supplied.
        if y is None:
            self._wrapped_model.fit(X)
        else:
            self._wrapped_model.fit(X, y)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)
def test_binarizer(self):
    """A Binarizer fed double-precision input converts and round-trips through
    ONNX (dumped for the backend comparison harness)."""
    X = np.array(
        [[1., -1., 2.], [2., 0., 0.], [0., 1., -1.]],
        dtype=np.float64,
    )
    skl_model = Binarizer(threshold=0.5)
    skl_model.fit(X)

    onnx_model = convert_sklearn(
        skl_model,
        "scikit-learn binarizer",
        [("input", DoubleTensorType(X.shape))],
        target_opset=TARGET_OPSET,
    )
    self.assertTrue(onnx_model is not None)
    dump_data_and_model(
        X, skl_model, onnx_model, basename="SklearnBinarizerDouble-SkipDim1"
    )
def test_binarizer_converter(self):
    """hummingbird's torch backend must match sklearn's Binarizer for a range
    of thresholds (including a negative one)."""
    X = np.array([[1, 2, -3], [4, -3, 0], [0, 1, 4], [0, -5, 6]], dtype=np.float32)
    X_tensor = torch.from_numpy(X)

    for cutoff in (0.0, 1.0, -2.0):
        skl_model = Binarizer(threshold=cutoff)
        skl_model.fit(X)

        converted = hummingbird.ml.convert(skl_model, "torch")
        self.assertIsNotNone(converted)
        np.testing.assert_allclose(
            skl_model.transform(X),
            converted.transform(X_tensor),
            rtol=1e-06,
            atol=1e-06,
        )
def test_default(self):
    """BinarizerComponent with its default config must behave exactly like a
    default sklearn Binarizer."""
    X_train, X_test, y_train, y_test, feature_names = self.load_data()

    component = BinarizerComponent()
    component.set_hyperparameters(self.get_default(component))
    component.fit(X_train, y_train)
    X_actual = component.transform(np.copy(X_test))

    reference = Binarizer()
    reference.fit(X_train, y_train)
    X_expected = reference.transform(X_test)

    # Feature names pass through unchanged, estimators match, outputs agree.
    assert component.get_feature_names_out(feature_names).tolist() == feature_names
    assert repr(component.estimator_) == repr(reference)
    assert np.allclose(X_actual, X_expected)
def ge_transform(df_GE, genes):
    """Scale gene-expression features to [0, 1] with MinMaxScaler, then
    binarize them, returning a DataFrame whose columns are `genes`.

    NOTE(review): relies on a module-level `threshold_binarize` constant —
    confirm it is defined wherever this module is used.
    """
    scaler = MinMaxScaler()
    binarizer = Binarizer(threshold=threshold_binarize)

    print(len(df_GE))
    # Samples become rows, genes become columns.
    features = df_GE.transpose()
    print(len(features))
    # Collapse duplicated gene columns by taking the per-sample maximum,
    # then keep only the requested genes, in order.
    features = features.groupby(features.columns, axis=1).agg(max)
    features = features[genes]

    scaler.fit(features)
    features = scaler.transform(features)
    binarizer.fit(features)
    features = binarizer.transform(features)
    print(len(features))

    result = pd.DataFrame(features)
    result.columns = genes
    return result
class BinarizerTransformation(AbstractPreProcessor):
    """Pre-processor that thresholds features to 0/1 via sklearn's Binarizer."""

    _binarizer = None
    threshold = 0.0
    copy = True

    def fit(self, data, y=None):
        # Build a fresh Binarizer on every fit so that changes to
        # threshold/copy take effect.
        self._binarizer = Binarizer(threshold=self.threshold, copy=self.copy)
        self._binarizer.fit(data)

    def transform(self, data, y=None):
        data = self._check_input(data)
        result = self._binarizer.transform(data)
        return self._check_output(data, result)

    def fit_transform(self, data, y=None):
        self.fit(data, y)
        return self.transform(data, y)
def Binz(df, target):
    """Robust-scale `df`, binarize the result, and rejoin the target column.

    Parameters
    ----------
    df : pandas.DataFrame containing the features and the target column.
    target : name of the target column in `df`.

    Returns
    -------
    pandas.DataFrame of binarized features aligned to the target's index,
    with the target appended as the last column and NaN rows dropped.
    """
    y_init = df[target]

    binarizer = Binarizer()
    scaled = RobScale(df)
    print('Binarizer fitting...')
    fitted = binarizer.fit(scaled)
    print('Binarizer transforming...')
    dfit = pd.DataFrame(fitted.transform(scaled))

    # Fix: `join_axes` was removed from pd.concat in pandas 1.0; aligning on
    # y_init's index via reindex() is the supported equivalent. Then drop any
    # NaNs introduced by the alignment (there were few in the landslides
    # vectorization).
    dfity = pd.concat([dfit, y_init], axis=1).reindex(y_init.index).dropna()
    print('The encoded data has shape:', dfity.shape, '\n\n')
    return dfity
# REPL-style walkthrough of sklearn preprocessing transformers.
# NOTE(review): bare expressions (indicator.features_, X1, ...) only display
# values in an interactive session; as a script they are no-ops.
indicator.features_
X1
indicator.transform(X1)
X2
indicator.transform(X2)
# features='all' reports an indicator column for every input feature,
# not just the ones that contained missing values.
indicator_all = MissingIndicator(features='all')
indicator_all.fit_transform(X1)
indicator_all.fit_transform(X2)
indicator_all.features_

from sklearn.preprocessing import Binarizer
X = [[1., -1., 2.], [2., 0., 0.], [0., 1., -1.]]
transformer = Binarizer()
type(transformer)
transformer.fit(X)  # Binarizer is stateless; fit only validates the input
transformer.transform(X)

from sklearn.preprocessing import MinMaxScaler
data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
scaler = MinMaxScaler()
scaler.fit(data)  # learns the per-column min/max used by transform
scaler.data_max_
scaler.min_
scaler.scale_
scaler.data_min_
scaler.data_max_
scaler.data_range_
scaler.transform(data)
# -*- coding: utf-8 -*- """ Created on Tue May 23 14:26:05 2017 @author: 凯风 """ from sklearn.preprocessing import Binarizer import numpy as np X_train = np.array([[1, -1, 2], [2, 0, 0], [0, 1, -1]]) X_test = np.array([[1, 1, 1], [2, 4, 2], [3, 4, 3]]) ''' 数据的二值化: 适用模型:泊松分布、文本处理 操作特点:返回对于数值特征的阈值判断 ''' bina = Binarizer(threshold=0.5, copy=True) bina.fit(X_train) bina.transform(X_train) ''' threshold ——阈值 '''
import numpy as np
from sklearn.preprocessing import Binarizer

# Binarize 20 samples drawn from N(100, 12) at threshold 100.
x = np.random.normal(100, 12, 20)
binobj = Binarizer(threshold=100)
# NOTE(review): x is 1-D; modern sklearn requires 2-D input here
# (x.reshape(-1, 1)) — confirm the sklearn version this targets.
bi = binobj.fit_transform(x)
print(bi)

import pandas as pd

# Binarize the cholesterol column of the heart dataset at 220.
# NOTE(review): hard-coded absolute Windows path; the column name contains a
# trailing space ('cholestoral ') — kept exactly as it appears in the file.
df = pd.read_excel('C:/Users/e.almaee/Desktop/Dataset/Heart data.xlsx')
binobj = Binarizer(threshold=220)
df = df.dropna(subset=['cholestoral '])
# fit() returns the (stateless) binarizer itself; transform does the work.
bi = binobj.fit(np.asarray(df['cholestoral ']).reshape(-1, 1))
df['cholestoral '] = bi.transform(np.asarray(df['cholestoral ']).reshape(-1, 1))
from sklearn import preprocessing
import numpy as np

# Batch-binarize grayscale images: standardize each image column-wise,
# threshold at 0, and accumulate the flattened results into one matrix.
folder = 'group_3re/'
# NOTE(review): `Binarizer`, `cv2` and `savemat` must be imported elsewhere —
# only `preprocessing` and `np` are imported in this chunk.
encoder = Binarizer()
final_data = np.array([])
size = 2
for i in range(0, size, 1):
    filename = str(i) + '.png'
    img = cv2.imread(folder + filename, cv2.IMREAD_GRAYSCALE)
    np.savetxt('img_0.txt', img)  # debug dump (overwritten every iteration)
    # Column-wise standardization: zero mean, unit variance per pixel column.
    scaler = preprocessing.StandardScaler().fit(img)
    X_scaled = scaler.transform(img)
    np.savetxt('x_sc.txt', X_scaled)
    encoder.fit(X_scaled)  # Binarizer is stateless; fit only validates input
    img = encoder.transform(X_scaled)  # values > 0 become 1, others 0
    np.savetxt('img_re.txt', img, fmt='%i')
    final_data = np.append(final_data, img)
    print(i, 'done')
print('fin=', final_data)
print(final_data.shape)
# Assumes each image flattens to 6400 values (80x80) — TODO confirm.
final_data = final_data.reshape(size, 6400)
print('re_fin= ', final_data)
print(final_data.shape)
savemat('final_data.mat', {'random': final_data})
np.savetxt('final_data.txt', final_data, fmt='%i')
import seaborn as sns
sns.set(style="darkgrid")
ax = sns.countplot(x="CODE_GENDER", data=train)

# Collect FLAG_* columns that carry exactly two distinct values
# (NaN counted as a value) in train and in test respectively.
cols2bin_train = [
    c for c in train.columns
    if c.startswith("FLAG_") and len(train[c].value_counts(dropna=False)) == 2
]
cols2bin_test = [
    c for c in test.columns
    if c.startswith("FLAG_") and len(test[c].value_counts(dropna=False)) == 2
]
# Binarize each two-valued flag column at threshold 0; the binarizer fitted
# on train is reused for test (Binarizer is stateless, so this is safe).
for c in cols2bin_train:
    binarizer = Binarizer()
    binarizer.fit(train[c].values.reshape(-1, 1))
    train[c] = binarizer.transform(train[c].values.reshape(-1, 1))
    test[c] = binarizer.transform(test[c].values.reshape(-1, 1))
flag_train = [c for c in train.columns if c.startswith("FLAG_")]
# Cast all but the first two FLAG_ columns to bool.
# NOTE(review): the [2:] slice presumably skips FLAG_OWN_CAR / FLAG_OWN_REALTY
# — confirm against the dataset's actual column order.
for c in flag_train[2:]:
    train.loc[:, c] = train[c].astype('bool')
    test.loc[:, c] = test[c].astype('bool')
# Sanity print of remaining unique values per flag column.
for c in flag_train:
    print(c)
    print(train[c].unique())
# Label-encode the categorical Y/N car-ownership flag on train only.
le = LabelEncoder()
le.fit(train.FLAG_OWN_CAR)
train.FLAG_OWN_CAR = le.transform(train.FLAG_OWN_CAR)
# REPL-style walkthrough: LabelEncoder, Binarizer, DictVectorizer,
# FunctionTransformer. Bare expressions only display values in an
# interactive session; as a script they are no-ops.
le.fit([1, 2, 2, 6])
le.classes_
le.transform([1, 1, 2, 6])
le.inverse_transform([0, 0, 2, 1])
le.fit(['a', 'a', 'b', 'b', 'c'])
le.classes_
le.fit_transform(['b', 'b', 'a', 'c'])
le.inverse_transform([0, 0, 1, 2, 2])

from sklearn.preprocessing import Binarizer
X = [[ 1., -1., 2.], [ 2., 0., 0.], [ 0., 1., -1.]]
binarizer = Binarizer()
binarizer.fit(X)  # Binarizer is stateless; fit only validates the input
binarizer.transform(X)
binarizer = Binarizer(threshold=1.1)  # only values > 1.1 map to 1
binarizer.transform(X)

from sklearn.feature_extraction import DictVectorizer
v = DictVectorizer(sparse=False)
D = [{'foo':1, 'bar':2}, {'foo':3, 'baz':1}]
X = v.fit_transform(D)
X
v.feature_names_
v.inverse_transform(X)
v.transform({'foo':4, 'unseen_feature':3})  # unseen features are dropped

from sklearn.preprocessing import FunctionTransformer
# NOTE(review): this definition is truncated here — its body lies outside
# this chunk of the file.
def all_b(x):
import numpy as np

X_data = np.array([[0.3, 0.6], [0.7, 0.5]])

from sklearn.preprocessing import Binarizer

# Map each entry to 0/1: values > 0.5 become 1, values <= 0.5 become 0.
binarizer = Binarizer(threshold=0.5)
binarizer.fit(X_data)  # Binarizer is stateless; fit only validates the input
# Fix: removed a stray no-op `Binarizer(copy=True, threshold=0.5)` expression
# that had been pasted in from REPL echo output — it constructed and
# immediately discarded an object.
X_data = binarizer.transform(X_data)
print(X_data)
def binarizer_(self, X, thre):
    """Binarize X at threshold `thre`: values > thre map to 1, others to 0.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features) to binarize.
    thre : float threshold separating 0s from 1s.

    Returns
    -------
    Binarized array of the same shape as X.

    Fix: the original ignored `thre` and always used threshold=0.0; it also
    shadowed the builtin `bin`.
    """
    binarizer = Binarizer(threshold=thre)
    binarizer.fit(X)  # Binarizer is stateless; fit only validates the input
    return binarizer.transform(X)
def binarizer_usecase():
    """Show Binarizer with threshold 1.1 on a small example matrix."""
    sample = np.array([[1., -1., 2.], [2., 0., 0.], [0., 1., -1.]])
    # Only values strictly greater than 1.1 become 1.
    thresholder = Binarizer(threshold=1.1)
    thresholder.fit(sample)  # stateless; fit only validates the input
    print(thresholder.transform(sample))
# L1-normalized output from DNorm (a Normalizer fitted earlier, outside
# this chunk); `array` and `set_printoptions` also come from elsewhere.
data_normalized = DNorm.transform(array)
set_printoptions(precision=2)
print("normalized data with l1: \n", data_normalized[0:3])

# L2 normalization: scale each row to unit Euclidean norm.
L2DNorm = Normalizer(norm='l2')
L2DNorm.fit(array)
L2data_normalized = L2DNorm.transform(array)
set_printoptions(precision=2)
print("normalized data with l2: \n", L2data_normalized[0:3])

# Binarization: values > 0.5 become 1, values <= 0.5 become 0.
from sklearn.preprocessing import Binarizer
my_binarizer = Binarizer(threshold=0.5)
my_binarizer.fit(array)  # Binarizer is stateless; fit only validates input
binarized_data = my_binarizer.transform(array)
print("our binarized data is: \n", binarized_data[0:3])

# Standarization: zero mean, unit variance per column.
from sklearn.preprocessing import StandardScaler
DScal = StandardScaler()
DScal.fit(array)
data_standarted = DScal.transform(array)
set_printoptions(precision=2)
print("our standarized data is: \n", data_standarted[0:3])

# Labelling Encoding (continues beyond this chunk).
from sklearn.preprocessing import LabelEncoder
pd_age = pd.DataFrame(age, columns=['age']) #%%分箱处理 '''1、监督分箱''' '''1-1 pd.cut()相当于为成绩设置优、良、差的有序分箱,可看作一种等量分箱''' bins = [0, 20, 50, 100] #设置间隔 pd.cut(pd_age['age'], bins, labels=['A', 'B', 'C']) #自己指定划分区间 pd.cut(pd_age['age'], 4, labels=['A', 'B', 'C', 'D'], retbins=True) #指定bins的数量N,实现等量均分 '''1-2 pd.qcut()也是一种有序的等频分箱''' pd.qcut(pd_age['age'], 4, retbins=True, labels=['A', 'B', 'C', 'D']) #指定划分组数实现等频分箱 #retbins返还划分区间,若标识为True,则返还两部分内容,一部分是Series,另一部分为numpy.array '''1-3 sklearn实现二分类分箱''' from sklearn.preprocessing import Binarizer box = Binarizer(threshold=60) #threshold是二分类的划分界限 box.fit(age) box.fit_transform(age) '''1-4 sklearn实现多分类分箱''' from sklearn.preprocessing import KBinsDiscretizer Kbox = KBinsDiscretizer(n_bins=4, encode='ordinal', strategy='kmeans') #encode中onehot-dense返回密集数组,onehot返回稀疏矩阵,ordinal返回一列 #strategy中quantile表示等频分箱,uniform表示等量分箱,kmeans表示最接近中心点的分箱 Kbox.fit(age) Kbox.fit_transform(age) Kbox.bin_edges_ #查看分类边界 '''2、有监督分箱''' '''2-1 卡方分箱''' from scipy.stats import chi2 chi2.cdf(10, 3) #输入:卡方值、自由度;输出:(1-P)值 chi2.sf(10, 3) #输入:卡方值、自由度;输出:P值 chi2.ppf(0.9814338645369568, 3) #输入:(1-P)值、自由度;输出:卡方值
import numpy as np
X_data = np.array([[0.3, 0.6], [0.7, 0.5]])
# Binarizer: binarization tool that maps data to 0/1. It takes a single
# parameter, threshold: values less than or equal to the threshold become 0,
# values greater than it become 1.
from sklearn.preprocessing import Binarizer  # import
binarizer = Binarizer(threshold=0.5)  # instantiate
binarizer.fit(X_data)  # fit (stateless: only validates the input)
# Binarizer(copy=True, threshold=0.5) is the repr a REPL would echo here.
X_data = binarizer.transform(X_data)  # transform
print(X_data)
from sklearn.model_selection import train_test_split
# NOTE(review): Python 2 print statements throughout — this script will not
# run under Python 3 as written.
x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=314)
print x_train.size, x_test.size
from sklearn.preprocessing import Binarizer
import numpy as np
# Threshold at the mean of the first training feature; the first positional
# argument of Binarizer is the threshold.
binary = Binarizer(np.mean(x_train[:, 0]))
print binary
# NOTE(review): fitting/transforming a 1-D column — newer sklearn versions
# require 2-D input (reshape(-1, 1)); confirm the targeted sklearn version.
binary.fit(x_train[:, 0])
print binary.transform(x_train[:, 0])
print np.mean(binary.transform( x_train[:, 0])) ## What percent of train is above the mean?
print np.mean(binary.transform(x_test[:, 0])) ## Transform X_Test
# What happens if I refit binary?
# (Binarizer is stateless; what changes here is the threshold passed at
# construction, now the mean of the TEST column.)
binary = Binarizer(np.mean(x_test[:, 0]))
binary.fit(x_test[:, 0])
print np.mean(binary.transform(x_test[:, 0]))
print np.mean(x_train[:, 0]), np.mean(x_test[:, 0])