def _test_binarizer_converter(self, threshold):
        warnings.filterwarnings("ignore")
        X = np.array([[1, 2, 3], [4, 3, 0], [0, 1, 4], [0, 5, 6]],
                     dtype=np.float32)

        # Create SKL model for testing
        model = Binarizer(threshold=threshold)
        model.fit(X)

        # Create ONNX-ML model
        onnx_ml_model = convert_sklearn(model,
                                        initial_types=[
                                            ("float_input",
                                             FloatTensorType_onnx(X.shape))
                                        ])

        # Create ONNX model by calling converter
        onnx_model = convert(onnx_ml_model, "onnx", X)

        # Get the predictions for the ONNX-ML model
        session = ort.InferenceSession(onnx_ml_model.SerializeToString())
        output_names = [
            session.get_outputs()[i].name
            for i in range(len(session.get_outputs()))
        ]
        inputs = {session.get_inputs()[0].name: X}
        onnx_ml_pred = session.run(output_names, inputs)[0]

        # Get the predictions for the ONNX model
        onnx_pred = onnx_model.transform(X)

        return onnx_ml_pred, onnx_pred
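# A concrete test could sweep thresholds and compare the two backends via this
# helper; a minimal sketch (hypothetical caller, not part of the original suite):
#
#     def test_binarizer_converter(self):
#         for threshold in [0.0, 1.0, -2.0]:
#             onnx_ml_pred, onnx_pred = self._test_binarizer_converter(threshold)
#             np.testing.assert_allclose(onnx_ml_pred, onnx_pred, rtol=1e-06, atol=1e-06)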
def use_Binarizer():
    x = [[1., -1, 2.], [2., 0., 0.], [0., 1., -1.]]
    scaler = Binarizer()
    scaler.fit(x)  # not actually needed; Binarizer is stateless
    print(scaler.transform(x))

    scaler = Binarizer(threshold=1.5)
    print(scaler.transform(x))

    # The simpler, functional version of Binarizer
    print(preprocessing.binarize(x))
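    # For the x above, the three prints come out as:
    #   default threshold 0.0      -> [[1. 0. 1.] [1. 0. 0.] [0. 1. 0.]]
    #   threshold=1.5              -> [[0. 0. 1.] [1. 0. 0.] [0. 0. 0.]]
    #   preprocessing.binarize(x)  -> same as the default-threshold result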
    def test_onnx_binarizer_converter_raises_rt(self):
        warnings.filterwarnings("ignore")
        X = np.array([[1, 2, 3], [4, 3, 0], [0, 1, 4], [0, 5, 6]], dtype=np.float32)
        model = Binarizer(threshold=0)
        model.fit(X)

        # generate test input
        onnx_ml_model = convert_sklearn(model, initial_types=[("float_input", FloatTensorType_onnx(X.shape))])
        onnx_ml_model.graph.node[0].attribute[0].name = "".encode()

        self.assertRaises(RuntimeError, convert, onnx_ml_model, "onnx", X)
    def test_onnxrt_python_Binarizer(self):
        iris = load_iris()
        X, y = iris.data, iris.target
        X_train, X_test, y_train, _ = train_test_split(X, y, random_state=11)
        clr = Binarizer()
        clr.fit(X_train, y_train)

        model_def = to_onnx(clr, X_train.astype(numpy.float32))
        oinf = OnnxInference(model_def)
        got = oinf.run({'X': X_test})
        self.assertEqual(list(sorted(got)), ['variable'])
        exp = clr.transform(X_test)
        self.assertEqualArray(exp, got['variable'], decimal=6)
Example #5
def test_binarizer_vs_sklearn():
    # Compare msmbuilder.preprocessing.Binarizer
    # with sklearn.preprocessing.Binarizer

    binarizerr = BinarizerR()
    binarizerr.fit(np.concatenate(trajs))

    binarizer = Binarizer()
    binarizer.fit(trajs)

    y_ref1 = binarizerr.transform(trajs[0])
    y1 = binarizer.transform(trajs)[0]

    np.testing.assert_array_almost_equal(y_ref1, y1)
Example #6
class BinarizerImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)
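# A minimal usage sketch, assuming Op is bound to sklearn.preprocessing.Binarizer
# (the wrapper only forwards hyperparameters and delegates fit/transform):
#
#     impl = BinarizerImpl(threshold=0.5)
#     impl.fit(np.array([[0.3, 0.6], [0.7, 0.5]]))
#     impl.transform(np.array([[0.3, 0.6], [0.7, 0.5]]))   # -> [[0. 1.] [1. 0.]]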
    def test_binarizer(self):
        data = np.array([[1., -1., 2.], [2., 0., 0.], [0., 1., -1.]],
                        dtype=np.float64)
        model = Binarizer(threshold=0.5)
        model.fit(data)
        model_onnx = convert_sklearn(model,
                                     "scikit-learn binarizer",
                                     [("input", DoubleTensorType(data.shape))],
                                     target_opset=TARGET_OPSET)
        self.assertTrue(model_onnx is not None)
        dump_data_and_model(data,
                            model,
                            model_onnx,
                            basename="SklearnBinarizerDouble-SkipDim1")
    def test_binarizer_converter(self):
        data = np.array([[1, 2, -3], [4, -3, 0], [0, 1, 4], [0, -5, 6]],
                        dtype=np.float32)
        data_tensor = torch.from_numpy(data)

        for threshold in [0.0, 1.0, -2.0]:
            model = Binarizer(threshold=threshold)
            model.fit(data)

            torch_model = hummingbird.ml.convert(model, "torch")
            self.assertIsNotNone(torch_model)
            np.testing.assert_allclose(
                model.transform(data),
                torch_model.transform(data_tensor),
                rtol=1e-06,
                atol=1e-06,
            )
    def test_default(self):
        X_train, X_test, y_train, y_test, feature_names = self.load_data()

        actual = BinarizerComponent()
        config = self.get_default(actual)

        actual.set_hyperparameters(config)
        actual.fit(X_train, y_train)
        X_actual = actual.transform(np.copy(X_test))

        expected = Binarizer()
        expected.fit(X_train, y_train)
        X_expected = expected.transform(X_test)

        assert actual.get_feature_names_out(feature_names).tolist() == feature_names
        assert repr(actual.estimator_) == repr(expected)
        assert np.allclose(X_actual, X_expected)
Example #11
File: fs.py  Project: Roche/AMASC
def ge_transform(df_GE, genes):
    scaler = MinMaxScaler()
    print(len(df_GE))
    binarizer = Binarizer(threshold=threshold_binarize)
    df_features = df_GE.transpose()
    print(len(df_features))
    df_features = df_features.groupby(df_features.columns, axis=1).agg(max)

    df_features = df_features[genes]
    scaler.fit(df_features)
    df_features = scaler.transform(df_features)
    binarizer.fit(df_features)
    df_features = binarizer.transform(df_features)
    print(len(df_features))
    df_features = pd.DataFrame(df_features)

    df_features.columns = genes
    return df_features
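# Hypothetical call site; threshold_binarize is assumed to be a module-level
# constant defined elsewhere in fs.py:
#
#     df_features = ge_transform(df_GE, genes)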
Example #12
class BinarizerTransformation(AbstractPreProcessor):

    _binarizer = None
    threshold = 0.0
    copy = True

    def fit_transform(self, data, y=None):
        self.fit(data, y)
        return self.transform(data, y)

    def transform(self, data, y=None):
        data = self._check_input(data)
        output = self._binarizer.transform(data)
        output = self._check_output(data, output)
        return output

    def fit(self, data, y=None):
        self._binarizer = Binarizer(threshold=self.threshold, copy=self.copy)
        self._binarizer.fit(data)
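# A rough usage sketch, assuming the inherited _check_input/_check_output helpers
# pass numpy arrays through unchanged (they are project-specific and not shown here):
#
#     t = BinarizerTransformation()
#     t.threshold = 0.5
#     t.fit_transform(np.array([[0.3, 0.6], [0.7, 0.5]]))   # -> [[0. 1.] [1. 0.]]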
Example #13
def Binz(df, target):
  # split into X and y datasets
  X_init = df.drop(target, axis=1)
  y_init = df[target]
  dum = Binarizer()
  scaled = RobScale(df)
  print('Binarizer fitting...')
  fit = dum.fit(scaled)
  print('Binarizer transforming...')
  dfit = pd.DataFrame(fit.transform(scaled))
  # drop any NaNs that may have been made (there were few in the landslides vectorization)
  dfity = pd.concat([dfit, y_init], axis=1).reindex(y_init.index).dropna()  # join_axes was removed from pandas; reindex is the equivalent
  print('The encoded data has shape:',dfity.shape,'\n\n')
  return dfity
indicator.features_
X1
indicator.transform(X1)
X2
indicator.transform(X2)

indicator_all = MissingIndicator(features='all')
indicator_all.fit_transform(X1)
indicator_all.fit_transform(X2)
indicator_all.features_

from sklearn.preprocessing import Binarizer
X = [[1., -1., 2.], [2., 0., 0.], [0., 1., -1.]]
transformer = Binarizer()
type(transformer)
transformer.fit(X)

transformer.transform(X)

from sklearn.preprocessing import MinMaxScaler
data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
scaler = MinMaxScaler()
scaler.fit(data)
scaler.data_max_
scaler.min_
scaler.scale_
scaler.data_min_
scaler.data_max_
scaler.data_range_
scaler.transform(data)
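# For the data above, the fitted scaler learns data_min_=[-1., 2.], data_max_=[1., 18.],
# data_range_=[2., 16.], scale_=[0.5, 0.0625], min_=[0.5, -0.125], and
# scaler.transform(data) maps each column linearly onto [0, 1]:
#     [[0.   0.  ]
#      [0.25 0.25]
#      [0.5  0.5 ]
#      [1.   1.  ]]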
Example #15
# -*- coding: utf-8 -*-
"""
Created on Tue May 23 14:26:05 2017

@author: 凯风
"""

from sklearn.preprocessing import Binarizer
import numpy as np

X_train = np.array([[1, -1, 2], [2, 0, 0], [0, 1, -1]])

X_test = np.array([[1, 1, 1], [2, 4, 2], [3, 4, 3]])
'''
    Binarization of data:
        Typical use cases: Poisson-distributed features, text processing
        Behavior: returns a 0/1 threshold decision for each numeric feature
'''

bina = Binarizer(threshold=0.5, copy=True)
bina.fit(X_train)
bina.transform(X_train)
'''
    threshold: the binarization cutoff
'''
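# For this X_train with threshold=0.5, values greater than 0.5 map to 1 and the
# rest to 0, so bina.transform(X_train) returns:
#     [[1 0 1]
#      [1 0 0]
#      [0 1 0]]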
Example #16
import numpy as np
from sklearn.preprocessing import Binarizer

x = np.random.normal(100,12,20)

binobj = Binarizer(threshold=100)
bi = binobj.fit_transform(x.reshape(-1, 1))  # reshape to 2-D: sklearn transformers expect (n_samples, n_features)

print(bi)


import pandas as pd
df = pd.read_excel('C:/Users/e.almaee/Desktop/Dataset/Heart data.xlsx')

binobj = Binarizer(threshold=220)
df = df.dropna(subset=['cholestoral '])
bi = binobj.fit(np.asarray(df['cholestoral ']).reshape(-1,1))
df['cholestoral '] = bi.transform(np.asarray(df['cholestoral ']).reshape(-1,1))
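# An equivalent vectorized alternative (same column, trailing space in the name
# included) would be a plain pandas comparison:
#
#     df['cholestoral '] = (df['cholestoral '] > 220).astype(float)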
Example #17
from sklearn import preprocessing
from sklearn.preprocessing import Binarizer
import numpy as np
import cv2
from scipy.io import savemat

folder = 'group_3re/'
encoder = Binarizer()
final_data = np.array([])
size = 2

for i in range(0, size, 1):
    filename = str(i) + '.png'
    img = cv2.imread(folder + filename, cv2.IMREAD_GRAYSCALE)
    np.savetxt('img_0.txt', img)

    scaler = preprocessing.StandardScaler().fit(img)
    X_scaled = scaler.transform(img)
    np.savetxt('x_sc.txt', X_scaled)

    encoder.fit(X_scaled)
    img = encoder.transform(X_scaled)

    np.savetxt('img_re.txt', img, fmt='%i')
    final_data = np.append(final_data, img)

    print(i, 'done')

print('fin=', final_data)
print(final_data.shape)
final_data = final_data.reshape(size, 6400)
print('re_fin= ', final_data)
print(final_data.shape)
savemat('final_data.mat', {'random': final_data})
np.savetxt('final_data.txt', final_data, fmt='%i')
Example #18
import seaborn as sns
from sklearn.preprocessing import Binarizer, LabelEncoder

sns.set(style="darkgrid")
ax = sns.countplot(x="CODE_GENDER", data=train)

cols2bin_train = [
    c for c in train.columns
    if c.startswith("FLAG_") and len(train[c].value_counts(dropna=False)) == 2
]
cols2bin_test = [
    c for c in test.columns
    if c.startswith("FLAG_") and len(test[c].value_counts(dropna=False)) == 2
]

for c in cols2bin_train:
    binarizer = Binarizer()
    binarizer.fit(train[c].values.reshape(-1, 1))
    train[c] = binarizer.transform(train[c].values.reshape(-1, 1))
    test[c] = binarizer.transform(test[c].values.reshape(-1, 1))

flag_train = [c for c in train.columns if c.startswith("FLAG_")]
for c in flag_train[2:]:
    train.loc[:, c] = train[c].astype('bool')
    test.loc[:, c] = test[c].astype('bool')

for c in flag_train:
    print(c)
    print(train[c].unique())

le = LabelEncoder()
le.fit(train.FLAG_OWN_CAR)
train.FLAG_OWN_CAR = le.transform(train.FLAG_OWN_CAR)
Example #19
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit([1, 2, 2, 6])
le.classes_
le.transform([1, 1, 2, 6])
le.inverse_transform([0, 0, 2, 1])
le.fit(['a', 'a', 'b', 'b', 'c'])
le.classes_
le.fit_transform(['b', 'b', 'a', 'c'])
le.inverse_transform([0, 0, 1, 2, 2])


from sklearn.preprocessing import Binarizer
X = [[ 1., -1.,  2.],
     [ 2.,  0.,  0.],
     [ 0.,  1., -1.]]
binarizer = Binarizer()
binarizer.fit(X)
binarizer.transform(X)
binarizer = Binarizer(threshold=1.1)
binarizer.transform(X)

from sklearn.feature_extraction import DictVectorizer
v = DictVectorizer(sparse=False)
D = [{'foo':1, 'bar':2}, {'foo':3, 'baz':1}]
X = v.fit_transform(D)
X
v.feature_names_
v.inverse_transform(X)
v.transform({'foo':4, 'unseen_feature':3})

from sklearn.preprocessing import FunctionTransformer
def all_b(x):
import numpy as np

X_data = np.array([[0.3, 0.6], [0.7, 0.5]])

from sklearn.preprocessing import Binarizer
binarizer = Binarizer(threshold=0.5)
binarizer.fit(X_data)
# Binarizer(copy=True, threshold=0.5)   <- the value returned by fit, as echoed in a console

X_data = binarizer.transform(X_data)
print(X_data)
Example #21
    def binarizer_(self, X, thre):

        binarizer = Binarizer(threshold=thre)
        binarizer.fit(X)
        return binarizer.transform(X)
def binarizer_usecase():
    X_train = np.array([[1., -1., 2.], [2., 0., 0.], [0., 1., -1.]])

    binarizer = Binarizer(threshold=1.1)
    binarizer.fit(X_train)
    print(binarizer.transform(X_train))
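    # With threshold=1.1, only values strictly greater than 1.1 map to 1, so this prints:
    #   [[0. 0. 1.]
    #    [1. 0. 0.]
    #    [0. 0. 0.]]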
data_normalized = DNorm.transform(array)
set_printoptions(precision=2)
print("normalized data with l1: \n", data_normalized[0:3])

#L2 normalization
L2DNorm = Normalizer(norm='l2')
L2DNorm.fit(array)
L2data_normalized = L2DNorm.transform(array)
set_printoptions(precision=2)
print("normalized data with l2: \n", L2data_normalized[0:3])

# Binarization
from sklearn.preprocessing import Binarizer

my_binarizer = Binarizer(threshold=0.5)
my_binarizer.fit(array)
binarized_data = my_binarizer.transform(array)
print("our binarized data is: \n", binarized_data[0:3])

# Standardization
from sklearn.preprocessing import StandardScaler

DScal = StandardScaler()
DScal.fit(array)
data_standardized = DScal.transform(array)
set_printoptions(precision=2)
print("our standardized data is: \n", data_standardized[0:3])

# Label Encoding
from sklearn.preprocessing import LabelEncoder
Example #24
pd_age = pd.DataFrame(age, columns=['age'])
#%% Binning
'''1. Unsupervised binning'''
'''1-1 pd.cut() assigns ordered bins (like grading scores A/B/C); with an integer it does an equal-width split'''
bins = [0, 20, 50, 100]  # set the bin edges
pd.cut(pd_age['age'], bins, labels=['A', 'B', 'C'])  # specify the intervals yourself
pd.cut(pd_age['age'], 4, labels=['A', 'B', 'C', 'D'],
       retbins=True)  # give the number of bins N for an equal-width split
'''1-2 pd.qcut() is ordered equal-frequency binning'''
pd.qcut(pd_age['age'], 4, retbins=True, labels=['A', 'B', 'C',
                                                'D'])  # give the number of groups for equal-frequency binning
# retbins returns the bin edges; when True, two things come back: a Series and a numpy.array
'''1-3 Binary binning with sklearn'''
from sklearn.preprocessing import Binarizer
box = Binarizer(threshold=60)  # threshold is the binary split boundary
box.fit(age)
box.fit_transform(age)
'''1-4 Multi-bin discretization with sklearn'''
from sklearn.preprocessing import KBinsDiscretizer
Kbox = KBinsDiscretizer(n_bins=4, encode='ordinal', strategy='kmeans')
# encode: 'onehot-dense' returns a dense array, 'onehot' a sparse matrix, 'ordinal' a single column
# strategy: 'quantile' is equal-frequency, 'uniform' is equal-width, 'kmeans' bins by nearest cluster center
Kbox.fit(age)
Kbox.fit_transform(age)
Kbox.bin_edges_  # inspect the bin boundaries
'''2. Supervised binning'''
'''2-1 Chi-square binning'''
from scipy.stats import chi2
chi2.cdf(10, 3)  # input: chi-square value, degrees of freedom; output: (1-P) value
chi2.sf(10, 3)  # input: chi-square value, degrees of freedom; output: P value
chi2.ppf(0.9814338645369568, 3)  # input: (1-P) value, degrees of freedom; output: chi-square value
import numpy as np

X_data = np.array([[0.3, 0.6], [0.7, 0.5]])

# Binarizer: a binarization tool that maps data to 0 and 1. It takes a single
# parameter, threshold: values less than or equal to the threshold become 0,
# and values greater than the threshold become 1.

from sklearn.preprocessing import Binarizer  # import
binarizer = Binarizer(threshold=0.5)  # instantiate
binarizer.fit(X_data)  # fit
# Binarizer(copy=True, threshold=0.5)  <- the value returned by fit, as echoed in a console

X_data = binarizer.transform(X_data)  # transform
print(X_data)
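# With threshold=0.5 this prints [[0. 1.] [1. 0.]]: 0.3 and 0.5 fall at or below
# the threshold and become 0, while 0.6 and 0.7 become 1.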
Example #26
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(iris.data,
                                                    iris.target,
                                                    random_state=314)

print(x_train.size, x_test.size)

from sklearn.preprocessing import Binarizer
import numpy as np

binary = Binarizer(threshold=np.mean(x_train[:, 0]))

print(binary)

binary.fit(x_train[:, 0].reshape(-1, 1))

print(binary.transform(x_train[:, 0].reshape(-1, 1)))
# What percent of train is above the mean?
print(np.mean(binary.transform(x_train[:, 0].reshape(-1, 1))))

# Transform x_test
print(np.mean(binary.transform(x_test[:, 0].reshape(-1, 1))))

# What happens if I refit binary?

binary = Binarizer(threshold=np.mean(x_test[:, 0]))
binary.fit(x_test[:, 0].reshape(-1, 1))
print(np.mean(binary.transform(x_test[:, 0].reshape(-1, 1))))

print(np.mean(x_train[:, 0]), np.mean(x_test[:, 0]))