def feature_prosessing():
    """Walk through scikit-learn preprocessing and feature selection on Iris.

    Loads the Iris dataset and applies, in order: scaling, binarization,
    one-hot encoding, missing-value imputation, polynomial expansion,
    variance thresholding, recursive feature elimination and several
    ``SelectFromModel`` variants.  A small sample of most results is printed;
    nothing is returned.
    """
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.feature_selection import RFE
    from sklearn.linear_model import LogisticRegression

    # Load the Iris dataset.
    iris = load_iris()
    # First two rows of the feature matrix and of the target vector.
    print(iris.data[:2], iris.target[:2])

    # Scaling demos: the transformed matrices are computed and discarded.
    StandardScaler().fit_transform(iris.data)
    MinMaxScaler().fit_transform(iris.data)

    # Binarize features at threshold 3, shown next to the target as a column.
    print(Binarizer(threshold=3).fit_transform(iris.data)[:2],
          iris.target.reshape((-1, 1))[:2])

    # Dummy (one-hot) encoding of the target.
    one_hot = OneHotEncoder()
    one_hot.fit(iris.target.reshape(-1, 1))
    # NOTE(review): n_values / active_features_ / feature_indices_ look like
    # legacy OneHotEncoder attributes -- confirm they exist in the installed
    # scikit-learn version.
    print(one_hot.n_values, one_hot.active_features_,
          one_hot.feature_indices_, one_hot.transform([[0]]).toarray())

    # Missing-value imputation; the return value is the data after imputing.
    # The missing-value marker defaults to NaN and the fill strategy defaults
    # to the mean.  A row of NaNs is stacked on top of the data to have
    # something to impute.
    # NOTE(review): Imputer is presumably the legacy sklearn.preprocessing
    # class (SimpleImputer in newer releases) -- confirm against the imports.
    print(Imputer().fit_transform(
        vstack((array([nan, nan, nan, nan]), iris.data)))[:2])

    # Compute each expansion once and reuse it for both printed values.
    poly = PolynomialFeatures().fit_transform(iris.data)
    print("PolynomialFeatures", poly[:4], len(poly))

    high_variance = VarianceThreshold(threshold=1).fit_transform(iris.data)
    print("VarianceThreshold", high_variance[:4], len(high_variance[0]))

    # Select the K best features and return the data restricted to them.
    # The first argument is the scoring function: it takes the feature matrix
    # and the target vector and yields (score, p-value) pairs, the i-th pair
    # belonging to the i-th feature.  Here it would be the Pearson
    # correlation.  Parameter k is the number of features to keep.
    print(iris.data[:2], iris.target[:2])
    # Disabled examples:
    # SelectKBest(lambda X, Y: array(map(lambda x: pearsonr(x, Y), X.T)).T, k=2).fit_transform(iris.data[:2], iris.target[:2])
    # SelectKBest(chi2, k=2).fit_transform(iris.data, iris.target)

    # Recursive feature elimination; returns the data after selection.
    # estimator is the base model, n_features_to_select the number kept.
    print(RFE(estimator=LogisticRegression(),
              n_features_to_select=3).fit_transform(iris.data, iris.target)[:2])

    # Feature selection with an L1-penalised logistic regression as base model.
    # NOTE(review): penalty="l1" needs a solver that supports it -- confirm
    # the default solver of the installed scikit-learn accepts it.
    print(SelectFromModel(LogisticRegression(penalty="l1", C=0.1))
          .fit_transform(iris.data, iris.target)[:2])

    # L1-based reduction keeps only one of several features that are equally
    # correlated with the target, so an unselected feature is not necessarily
    # unimportant.  Combining with an L2 penalty improves this: if a feature
    # has weight 1 under L1, the features whose L2 weights are close to it and
    # whose L1 weight is 0 form a group, and the L1 weight is shared equally
    # within that group.  This needs a dedicated logistic-regression model.
    # Feature selection with an L1+L2 logistic regression as base model;
    # threshold is the cut-off on the difference between weight coefficients.
    print(SelectFromModel(LR(threshold=0.5, C=0.1))
          .fit_transform(iris.data, iris.target)[:2])

    # Tree-based feature selection: GBDT can also serve as the base model of
    # SelectFromModel.
    print(SelectFromModel(GradientBoostingClassifier())
          .fit_transform(iris.data, iris.target)[:2])
def zero_one_dl(x: Union[np.ndarray, pd.Series]) -> np.ndarray:
    """Binarize the "diff-log" transform of a series.

    The input is cast to float64 and passed through a three-step pipeline:
    ``diff_log`` (wrapped in a ``FunctionTransformer``), constant imputation
    of missing values with 0.0, and binarization at threshold 0.0.  Positive
    diff-log values therefore become 1; negative, zero and imputed values
    become 0.

    Args:
        x: Values to transform.

    Returns:
        The binarized output of the fitted pipeline.
    """
    as_float = x.astype("float64")
    pipeline = make_pipeline(
        FunctionTransformer(diff_log),
        SimpleImputer(strategy="constant", fill_value=0.0),
        Binarizer(threshold=0.0),
    )
    return pipeline.fit_transform(as_float)
def BinOutcome(dataset):
    """Binarize every item's ``ptOut`` against the mean ``ptOut``.

    Args:
        dataset: Iterable of objects exposing a ``ptOut`` attribute.

    Returns:
        A single-row array (shape ``(1, n)``) holding 1 where an item's
        ``ptOut`` is above the mean of all ``ptOut`` values and 0 otherwise.
    """
    points = [entry.ptOut for entry in dataset]
    cutoff = np.mean(points)
    # Binarizer expects 2-D input, so present the values as one row.
    row = np.array(points).reshape(1, -1)
    return Binarizer(threshold=cutoff).fit_transform(row)
def preprocess(logits, labels):
    """Binarize predictions and labels after clearing agreed 'none' entries.

    ``logits`` is first passed through ``toone``.  For every row where both
    the prediction and the label in the 'none' column (looked up through
    ``Data.intentdict[1]['none']``) exceed 0.5, that entry is set to 0 in both
    arrays.  Both arrays are then binarized at threshold 0.2 and flattened.

    Note that ``labels`` is modified in place by the zeroing step.

    Returns:
        Tuple ``(logits, labels)`` of flattened binarized arrays.
    """
    logits = toone(logits)
    binarizer = Binarizer(threshold=0.2)
    for row in range(len(logits)):
        none_col = Data.intentdict[1]['none']
        both_none = logits[row][none_col] > 0.5 and labels[row][none_col] > 0.5
        if both_none:
            logits[row][none_col] = 0
            labels[row][none_col] = 0
    binary_logits = binarizer.fit_transform(logits)
    binary_labels = binarizer.fit_transform(labels)
    return binary_logits.flatten(), binary_labels.flatten()
def run_binarizer():
    """Print a small integer matrix and its binarization at threshold 4."""
    x = [
        [1, 2, 3, 4, 5, 6, 7],
        [3, 4, 5, 6, 7, 8, 9],
        [1, 7, 2, 6, 2, 7, 2],
        [3, 8, 6, 2, 8, 3, 8],
    ]
    print(x)
    thresholded = Binarizer(threshold=4).transform(x)
    print(thresholded)
def getTrigramFeatures(data):
    """Build binary word n-gram features (n = 1..3) for a text collection.

    Args:
        data: Iterable of documents handed to ``CountVectorizer``.

    Returns:
        Tuple ``(features, vectorizer)``: the binarized count matrix and the
        fitted ``CountVectorizer`` that produced it.
    """
    vectorizer = CountVectorizer(analyzer='word', lowercase=False, ngram_range=(1, 3))
    counts = vectorizer.fit_transform(data)
    # Collapse counts to presence/absence indicators.
    presence = Binarizer().fit_transform(counts)
    return presence, vectorizer
def test_validate_sklearn_linear_models_multiclass(self):
    """Export a Binarizer + LogisticRegression pipeline to PMML and validate it.

    The pipeline binarizes the 'sepal length (cm)' column through a
    ``DataFrameMapper``, is fitted on the fixture data, exported with
    ``skl_to_pmml`` and the written file is checked against ``self.schema``.
    """
    file_name = 'linear_model_multi_class_classification.pmml'

    df = pd.DataFrame(data=self.X, columns=self.features)
    df['species'] = self.y

    model = LogisticRegression()
    mapper = DataFrameMapper([(['sepal length (cm)'], Binarizer())])
    pipe = Pipeline([('mapper', mapper), ('model', model)])
    pipe.fit(df[self.features], df.species)

    skl_to_pmml(pipe, self.features, 'species', file_name)
    self.assertEqual(self.schema.is_valid(file_name), True)
def test_binarizer():
    """Print a small matrix before and after binarization at threshold 2.5."""
    from sklearn.preprocessing import Binarizer

    x = [[1, 2, 3, 4, 5],
         [5, 4, 3, 2, 1],
         [3, 3, 3, 3, 3],
         [1, 1, 1, 1, 1]]
    print("before transform:", x)
    # The threshold parameter sets the cut-off applied to the feature values.
    thresholder = Binarizer(threshold=2.5)
    binarized = thresholder.transform(x)
    print("after transform:", binarized)
def test_onnx_binarizer_converter_raises_rt(self):
    """Check that ``convert`` raises RuntimeError for a tampered ONNX Binarizer.

    A fitted ``Binarizer`` is converted to ONNX, the name of the first
    attribute of the first graph node is overwritten with an empty value, and
    converting that model is expected to raise ``RuntimeError``.
    """
    warnings.filterwarnings("ignore")
    X = np.array([[1, 2, 3], [4, 3, 0], [0, 1, 4], [0, 5, 6]], dtype=np.float32)
    model = Binarizer(threshold=0)
    model.fit(X)

    # Build the ONNX model that will be corrupted below.
    input_spec = [("float_input", FloatTensorType_onnx(X.shape))]
    onnx_ml_model = convert_sklearn(model, initial_types=input_spec)
    onnx_ml_model.graph.node[0].attribute[0].name = "".encode()

    with self.assertRaises(RuntimeError):
        convert(onnx_ml_model, "onnx", X)
def split_according_to_threshold(y, threshold):
    """Binarize ``y`` so that roughly a ``threshold`` share of entries is 1.

    Starting with a cut-off equal to ``threshold``, the cut-off is moved in
    steps of 0.01 (up when too many entries are 1, down otherwise) until the
    share of ones lies strictly within 0.01 of ``threshold`` or 100
    adjustments have been made.

    Args:
        y: Array supporting ``reshape``; it is treated as a single column.
        threshold: Target share of ones, also used as the initial cut-off.

    Returns:
        The column ``y.reshape(-1, 1)`` binarized at the final cut-off.
    """
    n_samples = len(y)
    column = y.reshape(-1, 1)
    tol = 0.01
    lower, upper = threshold - tol, threshold + tol

    def binarized(cutoff):
        return Binarizer(threshold=cutoff).fit_transform(column)

    def share_of_ones(cutoff):
        return binarized(cutoff).sum(axis=0) / n_samples

    cutoff = threshold
    share = share_of_ones(cutoff)
    adjustments = 0
    while not (lower < share < upper) and adjustments < 100:
        # Out of range and above the lower bound means too many ones.
        cutoff += 0.01 if lower < share else -0.01
        share = share_of_ones(cutoff)
        adjustments += 1
    return binarized(cutoff)
def test_binarizer(self):
    """Convert a Binarizer to ONNX with a double-typed input and dump the result."""
    data = np.array(
        [[1., -1., 2.],
         [2., 0., 0.],
         [0., 1., -1.]],
        dtype=np.float64,
    )
    model = Binarizer(threshold=0.5)
    input_types = [("input", DoubleTensorType(data.shape))]
    model_onnx = convert_sklearn(model, "scikit-learn binarizer", input_types)
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(data, model, model_onnx,
                        basename="SklearnBinarizerDouble-SkipDim1")
def __labelBinarizer(self,threshold):
    """Binarize the stored y data in place.

    Values greater than ``threshold`` map to 1; values less than or equal to
    it map to 0.  The result replaces ``self.__yData``.

    Parameters
    ----------
    threshold : float
        Cut-off passed to ``Binarizer``.

    Returns
    -------
    None
    """
    from sklearn.preprocessing import Binarizer

    self.__yData = Binarizer(threshold=threshold).fit_transform(self.__yData)
def build_auto(regressor, name, **pmml_options):
    """Fit, verify and store a PMML pipeline for the auto dataset.

    Builds a ``DataFrameMapper`` over the auto columns, chains it with
    ``SelectUnique`` and ``regressor`` in a ``PMMLPipeline``, fits it on the
    module-level ``auto_X`` / ``auto_y``, applies ``pmml_options``, verifies
    it on a 5% sample, then stores the pickled pipeline and a CSV of the
    predicted ``mpg`` values under ``name``.

    Args:
        regressor: Final estimator of the pipeline.
        name: Identifier passed to ``store_pkl`` and ``store_csv``.
        **pmml_options: Forwarded to ``pipeline.configure``.
    """
    cylinders_origin_mapping = {
        (8, 1): "8/1",
        (6, 1): "6/1",
        (4, 1): "4/1",
        (6, 2): "6/2",
        (4, 2): "4/2",
        (6, 3): "6/3",
        (4, 3): "4/3"
    }

    cylinders_origin = (["cylinders", "origin"], [
        MultiDomain([CategoricalDomain(), CategoricalDomain()]),
        MultiLookupTransformer(cylinders_origin_mapping, default_value="other"),
        LabelBinarizer()
    ])
    # Pre/post 1973 oil crisis effects
    model_year = (["model_year"], [CategoricalDomain(), Binarizer(threshold=77)],
                  {"alias": "bin(model_year, 77)"})
    model_year_origin = (["model_year", "origin"], [
        MultiDomain([CategoricalDomain(), CategoricalDomain()]),
        ConcatTransformer("/"),
        LabelBinarizer(),
        SelectorProxy(
            SelectFromModel(RandomForestRegressor(random_state=13, n_estimators=3),
                            threshold="1.25 * mean"))
    ])
    continuous = (["displacement", "horsepower", "weight", "acceleration"],
                  [ContinuousDomain(), StandardScaler()])
    weight_ratio = (["weight", "displacement"],
                    ExpressionTransformer("(X[0] / X[1]) + 0.5"),
                    {"alias": "weight / displacement + 0.5"})

    mapper = DataFrameMapper([
        cylinders_origin,
        model_year,
        model_year_origin,
        continuous,
        weight_ratio,
    ])
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("selector", SelectUnique()),
        ("regressor", regressor),
    ])
    pipeline.fit(auto_X, auto_y)
    pipeline.configure(**pmml_options)

    # XGBoost regressors are verified with explicit numeric tolerances.
    verify_options = {}
    if isinstance(regressor, XGBRegressor):
        verify_options = {"precision": 1e-5, "zeroThreshold": 1e-5}
    pipeline.verify(auto_X.sample(frac=0.05, random_state=13), **verify_options)

    store_pkl(pipeline, name)
    mpg = DataFrame(pipeline.predict(auto_X), columns=["mpg"])
    store_csv(mpg, name)
def test_onnx_helper_load_save_init_meta(self):
    """Check that model metadata survives ``select_model_inputs_outputs``.

    A Binarizer / OneHotEncoder / StandardScaler pipeline is converted to
    ONNX, two metadata properties are attached, and the model restricted to
    the "variable" output must carry the same properties.
    """
    X = numpy.array([[0.1, 1.1], [0.2, 2.2], [0.4, 2.2], [0.2, 2.4]])
    meta = {'pA': 'one', 'pB': 'two'}

    model = make_pipeline(Binarizer(), OneHotEncoder(sparse=False), StandardScaler())
    model.fit(X)
    input_types = [("input", FloatTensorType([None, 2]))]
    model_onnx = convert_sklearn(model, "pipe3", input_types)
    onnx.helper.set_model_props(model_onnx, meta)

    new_model = select_model_inputs_outputs(model_onnx, "variable")
    vals = {}
    for prop in new_model.metadata_props:
        vals[prop.key] = prop.value
    assert vals == meta
def build_proba_MNB(X, y, binary=True, verbose=False):
    """Fit a multinomial Naive Bayes classifier that accepts probabilistic labels.

    The feature encoding should be binarized; with ``binary=True`` a
    ``Binarizer`` step is placed in front of the classifier.

    Args:
        X: Training features.
        y: Training labels.
        binary: Whether to binarize the features before classification.
        verbose: Accepted but not used by this function.

    Returns:
        The fitted classifier, or the fitted pipeline when ``binary`` is true.
    """
    estimator = ProbaLabelMNB(alpha=0.1)
    if binary:
        estimator = Pipeline([('binarizer', Binarizer()), ('clf', estimator)])
    return estimator.fit(X, y)


# equations from "partially supervised classification of text documents"
def make_models(X, y, y_bin):
    """Build a dictionary of small scikit-learn models keyed by short names.

    Estimators are fitted on ``X`` with ``y`` or, for the ``*_bin`` variants
    and ``svc`` / ``abc3`` / ``abc4``, with ``y_bin``.  Clustering,
    decomposition and scaler models are fitted on ``X`` alone.  ``bin`` and
    the ``n*`` normalizers are returned unfitted.

    Constructor arguments are passed by keyword: ``Binarizer`` only accepts
    its threshold as a keyword in current scikit-learn releases.

    Args:
        X: Feature matrix.
        y: Target used for regressors and multi-class classifiers.
        y_bin: Target used for the binary classifiers.

    Returns:
        dict mapping model name to estimator/transformer instance.
    """
    return dict(
        ols=LinearRegression().fit(X, y),
        lr_bin=LogisticRegression().fit(X, y_bin),
        lr_ovr=LogisticRegression(multi_class='ovr').fit(X, y),
        lr_mn=LogisticRegression(solver='lbfgs', multi_class='multinomial').fit(X, y),
        svc=SVC(kernel='linear').fit(X, y_bin),
        svr=SVR(kernel='linear').fit(X, y),
        dtc=DecisionTreeClassifier(max_depth=4).fit(X, y),
        dtr=DecisionTreeRegressor(max_depth=4).fit(X, y),
        rfc=RandomForestClassifier(n_estimators=3, max_depth=3, random_state=1).fit(X, y),
        rfr=RandomForestRegressor(n_estimators=3, max_depth=3, random_state=1).fit(X, y),
        gbc=GradientBoostingClassifier(n_estimators=3, max_depth=3, random_state=1).fit(X, y),
        gbr=GradientBoostingRegressor(n_estimators=3, max_depth=3, random_state=1).fit(X, y),
        abc=AdaBoostClassifier(algorithm='SAMME', n_estimators=3, random_state=1).fit(X, y),
        abc2=AdaBoostClassifier(algorithm='SAMME.R', n_estimators=3, random_state=1).fit(X, y),
        abc3=AdaBoostClassifier(algorithm='SAMME', n_estimators=3, random_state=1).fit(X, y_bin),
        abc4=AdaBoostClassifier(algorithm='SAMME.R', n_estimators=3, random_state=1).fit(X, y_bin),
        km=KMeans(n_clusters=1).fit(X),
        km2=KMeans(n_clusters=5).fit(X),
        pc1=PCA(n_components=1).fit(X),
        pc2=PCA(n_components=2).fit(X),
        pc3=PCA(n_components=2, whiten=True).fit(X),
        mlr1=MLPRegressor(hidden_layer_sizes=[2], activation='relu').fit(X, y),
        mlr2=MLPRegressor(hidden_layer_sizes=[2, 1], activation='tanh').fit(X, y),
        mlr3=MLPRegressor(hidden_layer_sizes=[2, 2, 2], activation='identity').fit(X, y),
        mlc=MLPClassifier(hidden_layer_sizes=[2, 2], activation='tanh').fit(X, y),
        mlc_bin=MLPClassifier(hidden_layer_sizes=[2, 2], activation='identity').fit(X, y_bin),
        # Positional Binarizer(0.5) raises TypeError on current scikit-learn.
        bin=Binarizer(threshold=0.5),
        mms=MinMaxScaler().fit(X),
        mas=MaxAbsScaler().fit(X),
        ss1=StandardScaler().fit(X),
        ss2=StandardScaler(with_mean=False).fit(X),
        ss3=StandardScaler(with_std=False).fit(X),
        n1=Normalizer(norm='l1'),
        n2=Normalizer(norm='l2'),
        n3=Normalizer(norm='max'),
    )
def preprocess_data(X, n=3, suffix='', binarize=True, return_vect=False):
    """Vectorize documents into character n-gram counts, optionally binarized.

    Args:
        X: Documents passed to the vectorizer's ``fit_transform``.
        n: Maximum n-gram length (the minimum is 1).
        suffix: Argument forwarded to ``SimplePreprocessor``.
        binarize: If true, run the counts through ``Binarizer(copy=False)``.
        return_vect: If true, also return the fitted vectorizer.

    Returns:
        The feature matrix, or ``(matrix, vectorizer)`` when ``return_vect``
        is true.
    """
    analyzer = CharNGramAnalyzer(min_n=1, max_n=n,
                                 preprocessor=SimplePreprocessor(suffix))
    cv = CountVectorizer(analyzer)
    features = cv.fit_transform(X)
    if binarize:
        features = Binarizer(copy=False).transform(features)
    return (features, cv) if return_vect else features
def test_model_binarizer(self):
    """Convert a Binarizer to ONNX for float32 input and dump data and model."""
    data = np.array(
        [[1., -1., 2.],
         [2., 0., 0.],
         [0., 1., -1.]],
        dtype=np.float32,
    )
    model = Binarizer(threshold=0.5)
    input_types = [("input", FloatTensorType(data.shape))]
    model_onnx = convert_sklearn(model, "scikit-learn binarizer", input_types,
                                 target_opset=TARGET_OPSET)
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(data, model, model_onnx,
                        basename="SklearnBinarizer-SkipDim1")
def main():
    """Binarize the eight feature columns of the Pima diabetes CSV and print five rows.

    The file is read from a path relative to the working directory; the
    columns are named explicitly via ``names=``.
    """
    PATH = "../pima-indians-diabetes.data.csv"
    columns = [
        'preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'
    ]
    values = read_csv(PATH, names=columns).values
    # The first eight columns are the features, the ninth is the class.
    X, Y = values[:, 0:8], values[:, 8]
    binaryX = Binarizer(threshold=0.0).fit(X).transform(X)
    set_printoptions(precision=3)
    print(binaryX[0:5, :])
def test_query_many():
    """Test for the query_many method

    Every result of a single ``query`` call must be contained in the result
    of ``query_many`` for the same list of queries.
    """
    samples = np.random.randn(20, 6)
    model = BernoulliBayesianSet(Binarizer(threshold=0.5).fit_transform(samples))
    queries = [[0, 1, 2], [1, 2, 3], [2, 3, 4]]

    one_by_one = [model.query(query) for query in queries]
    batched = model.query_many(queries)

    for ranking in one_by_one:
        assert ranking in batched
def test_onnxrt_python_Binarizer(self):
    """Compare ``OnnxInference`` output for a Binarizer with scikit-learn's.

    The model is fitted on an Iris training split, converted with
    ``to_onnx``, run on the test split, and its single output 'variable' must
    match ``Binarizer.transform`` to six decimals.
    """
    iris = load_iris()
    X_train, X_test, y_train, _ = train_test_split(
        iris.data, iris.target, random_state=11)

    clr = Binarizer()
    clr.fit(X_train, y_train)

    oinf = OnnxInference(to_onnx(clr, X_train.astype(numpy.float32)))
    got = oinf.run({'X': X_test})
    self.assertEqual(sorted(got), ['variable'])

    expected = clr.transform(X_test)
    self.assertEqualArray(expected, got['variable'], decimal=6)
def get_clf(n=3, binarize=True, clf=None):
    """Build a character n-gram text classification pipeline.

    Args:
        n: Maximum n-gram length for the vectorizer (the minimum is 1).
        binarize: If true, add a ``Binarizer(copy=False)`` step after the
            vectorizer.
        clf: Final estimator.  When ``None``, ``BernoulliNB`` is used for
            binarized features and ``MultinomialNB`` otherwise.

    Returns:
        An unfitted ``Pipeline`` of vectorizer, optional binarizer and
        classifier.
    """
    analyzer = CharNGramAnalyzer(min_n=1, max_n=n,
                                 preprocessor=SimplePreprocessor())
    steps = [('vectorizer', CountVectorizer(analyzer))]
    if binarize:
        steps.append(('binarizer', Binarizer(copy=False)))
    # Compare against None rather than truth-testing: estimators that define
    # __len__ (pipelines, ensembles) may be falsy or raise when unfitted.
    if clf is None:
        clf = naive_bayes.BernoulliNB() if binarize else naive_bayes.MultinomialNB()
    steps.append(('clf', clf))
    return Pipeline(steps)
def Binz(df, target):
    """Binarize a scaled copy of ``df`` and attach the target column.

    ``df`` is passed through ``RobScale``, binarized with a default
    ``Binarizer``, joined column-wise with ``df[target]`` and aligned to the
    target's index.  Rows containing NaN after the join are dropped.

    Args:
        df: Input frame containing the ``target`` column.
        target: Name of the target column.

    Returns:
        DataFrame of binarized features plus the target column.
    """
    y_init = df[target]
    binarizer = Binarizer()
    # NOTE(review): the whole frame, target included, is handed to RobScale --
    # confirm that helper drops or ignores the target column.
    scaled = RobScale(df)
    print('Binarizer fitting...')
    fitted = binarizer.fit(scaled)
    print('Binarizer transforming...')
    dfit = pd.DataFrame(fitted.transform(scaled))
    # concat(join_axes=...) no longer exists in pandas; reindexing the result
    # on the target's index is the documented replacement.
    # Drop any NaNs that may have been made (there were a few in the
    # landslides vectorization).
    dfity = pd.concat([dfit, y_init], axis=1).reindex(y_init.index).dropna()
    print('The encoded data has shape:',dfity.shape,'\n\n')
    return dfity
def informationGain(texts, labels, nFeatures = 10000):
    """Compute the information gain of word-presence features.

    Texts are vectorized into binary bag-of-words indicators (alphabetic
    tokens, English stop words removed).  Unless ``nFeatures`` is -1, only the
    ``nFeatures`` words whose mean presence differs most between the negative
    class (label ``-1.0``) and all other labels are kept.  For each remaining
    word the gain is the entropy of all labels minus the size-weighted
    entropies of the labels of texts with and without the word.

    Args:
        texts: Iterable of documents.
        labels: Sequence of labels, indexable by position and aligned with
            ``texts``.
        nFeatures: Number of words to keep, or -1 to keep every word.

    Returns:
        dict mapping each kept word to its information gain.
    """
    vectorizer = CountVectorizer(token_pattern = '[a-zA-Z]+', stop_words='english')
    bow = vectorizer.fit_transform(texts)
    bow = Binarizer().fit(bow).transform(bow)
    # Newer scikit-learn releases only provide get_feature_names_out().
    if hasattr(vectorizer, 'get_feature_names_out'):
        names = vectorizer.get_feature_names_out().tolist()
    else:
        names = vectorizer.get_feature_names()
    n_labels = len(labels)

    if nFeatures != -1:
        # Split row positions by class, using the labels passed in.
        neg_rows = [i for i in range(n_labels) if labels[i] == -1.0]
        pos_rows = [i for i in range(n_labels) if labels[i] != -1.0]
        csr = bow.tocsr()
        pos_mean = csr[pos_rows, :].mean(axis=0).tolist()[0]
        neg_mean = csr[neg_rows, :].mean(axis=0).tolist()[0]
        diff = [abs(p - q) for p, q in zip(pos_mean, neg_mean)]
        # Ascending by class difference; the tail holds the strongest words.
        order = sorted(range(len(diff)), key=diff.__getitem__)
        indexes = order[len(order) - nFeatures:len(order)]
        names = [names[i] for i in indexes]
        bow = csr[:, indexes]

    info_gain = {}
    labels_entropy = entropy(labels)
    total = float(n_labels)
    for index, w in enumerate(names):
        count = index + 1
        if count % 500 == 0:
            # Progress in percent of processed words.
            print(count/bow.shape[1]*100)
        # A set keeps the membership tests below O(1) per row.
        with_rows = set(find(bow[:, index])[0].tolist())
        texts_with_w_labels = [labels[i] for i in range(n_labels) if i in with_rows]
        texts_without_w_labels = [labels[i] for i in range(n_labels) if i not in with_rows]
        info_gain[w] = (
            labels_entropy
            - (float(len(texts_with_w_labels)) / total) * entropy(texts_with_w_labels)
            - (float(len(texts_without_w_labels)) / total) * entropy(texts_without_w_labels)
        )
    return info_gain
def test_model_binarizer(self):
    """Convert an unfitted Binarizer to ONNX and dump it with a one-row input."""
    model = Binarizer(threshold=0.5)
    input_types = [("input", FloatTensorType([None, 1]))]
    model_onnx = convert_sklearn(model, "scikit-learn binarizer", input_types)
    self.assertTrue(model_onnx is not None)

    sample = numpy.array([[1, 1]], dtype=numpy.float32)
    dump_data_and_model(sample, model, model_onnx,
                        basename="SklearnBinarizer-SkipDim1")
def words_killer(self, train, data, method, words_num_names_kept=50):
    """Reduce the columns of ``data`` with the chosen method.

    Args:
        train: Frame providing the ``'label_class'`` column, joined to
            ``data`` on the index (inner join).
        data: Feature frame to reduce.
        method: ``'nb'`` (Binarizer + BernoulliNB), ``'lg'`` (MinMaxScaler +
            class-balanced LogisticRegression) or ``'pca'``.  Any other value
            returns ``data`` unchanged.
        words_num_names_kept: Number of columns (or PCA components) to keep.

    Returns:
        For ``'nb'`` / ``'lg'``: the original columns of ``data`` with the
        largest absolute first-row model coefficients.  For ``'pca'``: a frame
        of projected components indexed like ``data``.  Otherwise ``data``.
    """

    def keep_top_weighted(normalizer, clf):
        # Normalize, fit on the labelled rows, report the cross-validated AUC
        # and keep the columns with the largest absolute coefficients.
        normalized_data = pd.DataFrame(normalizer.fit_transform(data))
        normalized_data.index = data.index
        train_data = pd.concat([normalized_data, train['label_class']],
                               axis=1, join='inner')
        features = train_data.drop('label_class', axis=1)
        target = train_data['label_class']
        clf.fit(features, target)
        print('words killer auc: ',
              cross_val_score(clf, features, target, scoring='roc_auc'))
        weights = pd.Series(clf.coef_[0])
        weights.index = data.columns
        top = weights.abs().sort_values(ascending=False)[:words_num_names_kept]
        return data[top.index]

    if method == 'nb':
        return keep_top_weighted(Binarizer(), BernoulliNB())
    if method == 'pca':
        reducer = PCA(n_components=words_num_names_kept)
        projected = pd.DataFrame(reducer.fit_transform(data))
        projected.index = data.index
        return projected
    if method == 'lg':
        return keep_top_weighted(MinMaxScaler(),
                                 LogisticRegression(class_weight='balanced'))
    return data
def preprocess(df):
    """Interactively build train/val/test splits with a mean-binarized target.

    Rows are filtered on the 'PER' column and shuffled, the last column is
    taken as the target and binarized at its mean, and the data is cut into
    roughly 70% / 15% / 15% slices.  The share of non-zero targets in each
    slice is printed and the user is asked to accept it; any answer other
    than "y" calls the function again on the filtered frame, which
    reshuffles it.

    Histograms of x and y are shown with matplotlib while ``verbose`` is true.

    Returns:
        dict with keys 'train', 'val' and 'test', each mapping to an
        ``(x, y)`` tuple of column arrays.
    """
    data = {}
    verbose = True
    # Keep rows whose 'PER' value passes between(0, 20000), then shuffle
    # all of them (sample with frac=1).
    df = df[df['PER'].between(0,20000)].sample(frac=1)
    print(df.head())
    # Every column but the last is flattened into a single feature column;
    # the last column becomes the target column.
    # NOTE(review): with more than one feature column x gets more rows than
    # y -- presumably df has exactly two columns; confirm.
    x, y = df.to_numpy()[:,:-1].reshape(-1,1), df.to_numpy()[:,-1].reshape(-1,1)
    # Distributions before any transformation.
    f,ax = plt.subplots(1,2)
    ax[0].hist(x)
    ax[0].set_title('x prev')
    ax[1].hist(y)
    ax[1].set_title('y prev')
    if verbose:
        plt.show()
    # Disabled experiment: rescale x and y and plot the result.
    # NOTE(review): placement of the 'post' plots inside this disabled block
    # is assumed -- confirm against the original layout.
    if False:
        for s in [MinMaxScaler]:#[PowerTransformer, StandardScaler]:
            x=s().fit_transform(x)
            y=s().fit_transform(y)
        f,ax = plt.subplots(1,2)
        ax[0].hist(x)
        ax[0].set_title('x post')
        ax[1].hist(y)
        ax[1].set_title('y post')
        if verbose:
            plt.show()
        else:
            plt.close()
    # Target construction: the interactive radius-range variant is disabled;
    # the active branch binarizes y at its mean.
    if False:
        answ = input('please insert the radius range: e.g. (x y)')
        rmin = float(answ.split(' ')[0])
        rmax = float(answ.split(' ')[1])
        y = np.asarray([[1] if rmin<x[0]<rmax else [0] for x in y])
    elif True:
        y = Binarizer(threshold=np.mean(y)).fit_transform(y)
        #y = y
    L = df.shape[0]
    # Slice boundaries: first 70% train, next 15% val, last 15% test.
    # NOTE(review): when int(0.15*L) is 0 the 'test' slice is slice(0, None),
    # i.e. the whole array -- confirm this cannot happen for the data used.
    divider = {'train':slice(0,int(0.7*L)), 'val':slice(int(0.7*L),int((0.7+0.15)*L)), 'test':slice(-int(0.15*L),None),}
    for k,i in divider.items():
        data[k] = (x[i],y[i])
        # Report the class balance of this split.
        print(f'for key {k} {np.count_nonzero(data[k][1])/len(data[k][1])*100}% are non-zero')
    answ = input('if you are happy with the ratio, press "y"... else "n"')
    if answ=='y':
        return data
    else:
        # Try again with a fresh shuffle.
        return preprocess(df)
def Binarize_Dataset():
    """Download price data, show binarized and standardized views, then prompt.

    Asks for a date range and symbol through ``start_date``, ``end_date`` and
    ``input_symbol``, downloads the data with ``yf.download`` and:

    1. prints the first five rows of the first five columns binarized at 0.0;
    2. splits the first five columns (X) and the sixth column (Y) 80/20 and
       standardizes X, fitting the scaler on the training split only;
    3. prints both splits and asks whether to return to the menu or exit.
    """
    s = start_date()
    e = end_date()
    sym = input_symbol()
    df = yf.download(sym, s, e)
    array = df.values
    X = array[:, 0:5]
    Y = array[:, 5]

    # Initialise the binarizer.
    binarizer = Binarizer(threshold=0.0).fit(X)
    binaryX = binarizer.transform(X)
    np.set_printoptions(precision=3)
    print(
        'Binarize values equal or less than 0 are marked 0 and all of those above 0 are marked 1'
    )
    print(binaryX[0:5, :])
    print("")

    # Split the dataset into training and test sets.
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2,
                                                        random_state=0)
    # Standardize: learn mean/variance on the training split and reuse them
    # for the test split, so both are on the same scale and no test-set
    # statistics are used.
    sc_X = StandardScaler()
    X_train = sc_X.fit_transform(X_train)
    X_test = sc_X.transform(X_test)

    print("Training Dataset")
    print(X_train)
    print("")
    print(Y_train)
    print("")
    print("Testing Dataset")
    print(X_test)
    print("")
    print(Y_test)
    print("")

    ans = ['1', '2']
    user_input = input(""" What would you like to do next? Enter option 1 or 2. 1. Menu 2. Exit Command: """)
    # Keep asking until one of the two valid options is entered.
    while user_input not in ans:
        print("Error: Please enter a a valid option 1-2")
        user_input = input("Command: ")
    if user_input == "1":
        menu()
    elif user_input == "2":
        exit()
def binarize(self, X):
    """Binarize predictions with the stored threshold.

    If no 'thresh' value is stored, ``set_thresh`` is called first.  A
    ``Binarizer`` using that threshold is created and stored under
    'binarizer' when none exists yet.

    Args:
        X: Object exposing ``.values``; the values are reshaped to one row.

    Returns:
        The binarized row passed through ``pandasize``.
    """
    if not self.get('thresh'):
        self.set_thresh()
    if not self.get('binarizer'):
        fresh = Binarizer(threshold=self.get('thresh'))
        self.set('binarizer', fresh)
    binarizer = self.get('binarizer')
    as_row = X.values.reshape(1, -1)
    return pandasize(binarizer.transform(as_row))
def test_sklearn_22(self):
    """Export a Binarizer + LogisticRegression pipeline on Iris to PMML.

    The test passes when ``skl_to_pmml`` leaves a file named
    "binarizer.pmml" on disk.
    """
    target = 'Species'
    pmml_file = "binarizer.pmml"

    iris = datasets.load_iris()
    irisd = pd.DataFrame(iris.data, columns=iris.feature_names)
    irisd[target] = iris.target
    features = irisd.columns.drop(target)

    model = LogisticRegression()
    steps = [("scaler", Binarizer(threshold=2)), ("model", model)]
    pipeline_obj = Pipeline(steps)
    pipeline_obj.fit(irisd[features], irisd[target])

    skl_to_pmml(pipeline_obj, features, target, pmml_file)
    self.assertEqual(os.path.isfile(pmml_file), True)