def test_binarizer():
    # From sklearn's test suite; `toarray` and `assert_equal` are test helpers
    # defined alongside it, and `sp` is scipy.sparse.
    X_ = np.array([[1, 0, 5], [2, 3, 0]])

    for init in (np.array, sp.csr_matrix):
        X = init(X_.copy())

        # threshold=2.0: only values strictly greater than 2 map to 1
        binarizer = Binarizer(threshold=2.0, copy=True)
        X_bin = toarray(binarizer.transform(X))
        assert_equal(np.sum(X_bin == 0), 4)
        assert_equal(np.sum(X_bin == 1), 2)

        # default threshold=0.0
        binarizer = Binarizer(copy=True).fit(X)
        X_bin = toarray(binarizer.transform(X))
        assert X_bin is not X
        assert_equal(np.sum(X_bin == 0), 2)
        assert_equal(np.sum(X_bin == 1), 4)

        binarizer = Binarizer(copy=True)
        X_bin = binarizer.transform(X)
        assert X_bin is not X
        X_bin = toarray(X_bin)
        assert_equal(np.sum(X_bin == 0), 2)
        assert_equal(np.sum(X_bin == 1), 4)

        # copy=False transforms in place
        binarizer = Binarizer(copy=False)
        X_bin = binarizer.transform(X)
        assert X_bin is X
        X_bin = toarray(X_bin)
        assert_equal(np.sum(X_bin == 0), 2)
        assert_equal(np.sum(X_bin == 1), 4)
def predict_bay(df_test_input, param_1, param_2):
    df_train_input = param_1
    df_train_output = param_2

    # Binarization (Binarizer is stateless, so fitting per set is harmless)
    transformer = Binarizer().fit(df_train_input)
    df_train_input_ = pd.DataFrame(transformer.transform(df_train_input))
    transformer = Binarizer().fit(df_test_input)
    df_test_input_ = pd.DataFrame(transformer.transform(df_test_input))

    # PCA with a manually chosen number of components
    number_principal_components = 100
    pca = PCA(n_components=number_principal_components)
    pca.fit(df_train_input_)
    principal_components_train = pca.transform(df_train_input_)
    # project the test data onto the same components
    principal_components_test = pca.transform(df_test_input_)

    # shift both sets by the global minimum to make the data non-negative
    lowest_num = min(principal_components_test.min(),
                     principal_components_train.min())
    principal_components_train = abs(lowest_num) + principal_components_train
    principal_components_test = abs(lowest_num) + principal_components_test

    # Naive Bayes
    bayes = GaussianNB()
    bayes.fit(principal_components_train, df_train_output['class'].values)
    bayes_labels_pred = pd.DataFrame(bayes.predict(principal_components_test))
    return bayes_labels_pred, number_principal_components
def use_Binarizer():
    x = [[1., -1, 2.],
         [2., 0., 0.],
         [0., 1., -1.]]

    scaler = Binarizer()
    scaler.fit(x)  # not needed: Binarizer is stateless
    print(scaler.transform(x))

    scaler = Binarizer(threshold=1.5)
    print(scaler.transform(x))

    # simple functional version of Binarizer
    print(preprocessing.binarize(x))
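# Since Binarizer keeps no state, the functional form gives identical results.
# A minimal sketch of that equivalence, assuming numpy and sklearn are available:
import numpy as np
from sklearn import preprocessing

x = np.array([[1., -1., 2.], [2., 0., 0.], [0., 1., -1.]])
assert np.array_equal(
    preprocessing.Binarizer(threshold=1.5).fit(x).transform(x),
    preprocessing.binarize(x, threshold=1.5),
)  # both map values > 1.5 to 1 and everything else to 0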
def ExtractWordFeaturesWithDataframes(train_dataset_df, test_dataset_df,
                                      vectorizer_type="CountVectorizer",
                                      ngrams=None, balance_dataset=False,
                                      remove_center_interval=None):
    # Main logic of the method ExtractWordFeatures.
    (train_speeches, Y_train) = extractTextsAndLabelsFromDf(
        train_dataset_df, balance_dataset=balance_dataset,
        remove_center_interval=remove_center_interval)
    (test_speeches, Y_test) = extractTextsAndLabelsFromDf(
        test_dataset_df, balance_dataset=balance_dataset,
        remove_center_interval=remove_center_interval)

    if vectorizer_type == "CountVectorizer":
        if ngrams is not None:
            vectorizer = CountVectorizer(stop_words='english',
                                         token_pattern=r'[a-zA-Z]+',
                                         ngram_range=(1, ngrams))
        else:
            vectorizer = CountVectorizer(stop_words='english',
                                         token_pattern=r'[a-zA-Z]+')
    if vectorizer_type == "HashingVectorizer":
        # counts are binarized below, so a plain CountVectorizer is used here
        vectorizer = CountVectorizer(stop_words='english',
                                     token_pattern=r'[a-zA-Z]+')
    if vectorizer_type == "TfidfVectorizer":
        if ngrams is not None:
            vectorizer = TfidfVectorizer(stop_words='english',
                                         token_pattern=r'[a-zA-Z]+',
                                         ngram_range=(1, ngrams))
        else:
            vectorizer = TfidfVectorizer(stop_words='english',
                                         token_pattern=r'[a-zA-Z]+')

    X_train = vectorizer.fit_transform(train_speeches)
    X_test = vectorizer.transform(test_speeches)

    if vectorizer_type == "HashingVectorizer":
        # Binarizer is stateless, so fitting on each set separately is safe
        transformer = Binarizer().fit(X_train)
        X_train = transformer.transform(X_train)
        transformer = Binarizer().fit(X_test)
        X_test = transformer.transform(X_test)

    feature_names = vectorizer.get_feature_names()
    return X_train, Y_train, X_test, Y_test, vectorizer, feature_names
def test_binarizer():
    b = Binarizer(threshold=np.mean(X))
    inputs = ['x{0}'.format(i + 1) for i in range(X.shape[1])]
    expr = skompile(b.transform, inputs)
    assert np.all(b.transform(X) == np.asarray(
        [expr.evaluate(x1=x[0], x2=x[1], x3=x[2], x4=x[3]) for x in X]))
def get_eval_by_threshold(y_test, pred_proba_c1, thresholds):
    # Iterate over each value in the thresholds list and run the evaluation.
    for custom_threshold in thresholds:
        binarizer = Binarizer(threshold=custom_threshold).fit(pred_proba_c1)
        custom_predict = binarizer.transform(pred_proba_c1)
        print('Threshold:', custom_threshold)
        get_clf_eval(y_test, custom_predict, pred_proba_c1)
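# A minimal self-contained variant of the sweep above; `get_clf_eval` is
# assumed to be defined elsewhere, so this sketch reports sklearn metrics
# directly instead. `pred_proba_c1` is the positive-class probability column
# reshaped to 2-D, e.g. clf.predict_proba(X_test)[:, 1].reshape(-1, 1).
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.preprocessing import Binarizer

def sweep_thresholds(y_test, pred_proba_c1, thresholds=(0.4, 0.45, 0.5, 0.55, 0.6)):
    for th in thresholds:
        pred = Binarizer(threshold=th).fit(pred_proba_c1).transform(pred_proba_c1)
        print('threshold=%.2f  accuracy=%.4f  precision=%.4f  recall=%.4f' % (
            th,
            accuracy_score(y_test, pred),
            precision_score(y_test, pred),
            recall_score(y_test, pred)))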
def cv_mean_std_array(X, y, alphas, ks, n_a, n_k, cv=20):
    n = n_a * n_k  # originally `n_alphas*n_ks`, which are undefined names
    cv_mean = np.empty(n)
    cv_std = np.empty(n)
    regressors = pd.DataFrame()
    binarizer = Binarizer(threshold=1400)
    y_binary = binarizer.transform(y).transpose().ravel()
    itt_counter = 0
    print('size n_a: %d n_k: %d' % (n_a, n_k))
    for i in range(0, n_a):
        print('reg. column : %d' % (i * n_k))
        temp_string = 'alpha=%f' % alphas[i * n_k]
        print(temp_string)
        print(regressors.shape)
        df_temp = pd.DataFrame()
        print('computing for alpha = %f' % alphas[i * n_k])
        X_lasso, df_temp[temp_string] = df_Lasso(X, y, alphas[i * n_k])
        regressors = pd.concat([regressors, df_temp], ignore_index=True, axis=1)
        for j in range(0, n_k):
            print('i:%d, j:%d' % (i, j))
            print('computing for alpha = %f and k = %f' % (alphas[n_k * i + j], ks[n_k * i + j]))
            print('X_lasso shape:')
            print(X_lasso.shape)
            cv_mean[n_k * i + j], cv_std[n_k * i + j] = knn_cv_mean_and_std(
                X_lasso, y_binary, alphas[n_k * i + j], ks[n_k * i + j], cv=cv)
            itt_counter = itt_counter + 1
            print('completed %dth iteration of knn cv mean:%f std:%f, at pos:%d' % (
                itt_counter, cv_mean[n_k * i + j], cv_std[n_k * i + j], n_k * i + j))
    return cv_mean, cv_std, regressors
def get_feature_vectors(self, emails_bodies):
    # create a vectorizer
    vectorizer = TfidfVectorizer(analyzer='word', strip_accents=None,
                                 ngram_range=(1, 1),
                                 max_features=self.max_features,
                                 stop_words='english', norm=None)
    # train it on the email bodies
    vectorizer = vectorizer.fit(emails_bodies)
    # transform the raw email bodies into feature vectors
    features_vectors = vectorizer.transform(
        tqdm(emails_bodies, desc=" Creating emails feature vector"))
    # create a binarizer that turns the TF-IDF features into binary feature
    # vectors (0 for non-occurrence and 1 for occurrence)
    binarizer = Binarizer().fit(features_vectors)
    # needed for good-word attack
    features_bin = binarizer.transform(features_vectors)
    # get the feature names, vocabulary and weights
    feature_names = vectorizer.get_feature_names()
    features_with_indices = vectorizer.vocabulary_
    features_weights = vectorizer.idf_
    return features_vectors, feature_names, features_with_indices, features_weights, features_bin
def myxgb(x_train, x_test, y_train, y_test):
    params = {
        'booster': 'gbtree',
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'gamma': 0.1,
        'max_depth': 8,
        'alpha': 0,
        'lambda': 0,
        'subsample': 0.7,
        'colsample_bytree': 0.5,
        'min_child_weight': 3,
        'silent': 0,
        'eta': 0.03,
        'nthread': -1,
        'seed': 2019,
    }
    num_round = 180
    dtrain = xgb.DMatrix(x_train, y_train)
    bst = xgb.train(params, dtrain, num_round)
    # pickle.dump(bst, open("xgboostclass2.dat", "wb"))
    dtest = xgb.DMatrix(x_test, y_test)
    # loaded_model = pickle.load(open("xgboostclass2.dat", "rb"))
    ypreds = bst.predict(dtest)
    # print(y_test)
    # print(ypreds)
    # turn predicted probabilities into hard labels at a tuned threshold
    bn = Binarizer(threshold=0.42444044)
    ypreds = bn.transform(ypreds.reshape(-1, 1))
    print("myxgb accuracy:", accuracy_score(y_test, ypreds))
def rescaleData(cls):
    dataframe = read_csv(cls.filename, names=cls.names)
    array = dataframe.values
    # separate array into input and output components
    X = array[:, 0:8]
    y = array[:, 8]

    print("\nRescaled with MinMaxScaler")
    scaler = MinMaxScaler(feature_range=(0, 1))
    rescaledX = scaler.fit_transform(X)
    # summarize transformed data
    set_printoptions(precision=3)
    print(rescaledX[0:5, :])

    print("\nRescaled with StandardScaler")
    scaler = StandardScaler().fit(X)
    rescaledX = scaler.transform(X)
    # summarize transformed data
    set_printoptions(precision=3)
    print(rescaledX[0:5, :])

    print("\nRescaled with Normalizer")
    scaler = Normalizer().fit(X)
    normalizedX = scaler.transform(X)
    # summarize transformed data
    set_printoptions(precision=3)
    print(normalizedX[0:5, :])

    print("\nRescaled with Binarizer")
    binarizer = Binarizer(threshold=0.0).fit(X)
    binaryX = binarizer.transform(X)
    # summarize transformed data
    set_printoptions(precision=3)
    print(binaryX[0:5, :])
def binarize_data():
    from sklearn.preprocessing import Binarizer
    array = load_data()
    x = array[:, 0:8]
    y = array[:, 8]
    binarizer = Binarizer(threshold=0.0).fit(x)
    binaryx = binarizer.transform(x)
    return binaryx, binaryx[0:5, :]
def binarizer(df):
    """
    Binarize the data against a threshold (set each feature value to 0 or 1).
    """
    X = df.values
    transformer = Binarizer().fit(X)  # fit does nothing.
    matrix = transformer.transform(X)
    return matrix
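# A small hypothetical usage sketch for the helper above: with the default
# threshold of 0.0, strictly positive values map to 1 and everything else to 0.
import pandas as pd

df = pd.DataFrame({'a': [0.5, -1.0, 0.0], 'b': [2.0, 0.1, -0.3]})
print(binarizer(df))
# expected:
# [[1. 1.]
#  [0. 1.]
#  [0. 0.]]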
def evaluate(threshold, meridian, result_data, true_data):
    meridian_names = ['LUNG', 'SPLEEN', 'STOMACH', 'HEART', 'KIDNEY',
                      'LIVER', 'LARGE INTESTINE']
    binarizer = Binarizer(threshold=threshold).fit(result_data)
    result_binary = pd.DataFrame(binarizer.transform(result_data),
                                 columns=meridian_names)
    # list_of_functions is assumed to hold metric callables defined elsewhere
    eva_scores = list(map(lambda x: x(true_data[meridian], result_binary[meridian]),
                          list_of_functions))
    auc_score = roc_auc_score(true_data[meridian], result_data[meridian])
    eva_scores.append(auc_score)
    return eva_scores
def get_eval_by_threshold(y_test, pred_proba_c1, threshold):
    # Iterate over each value in the threshold list and run the evaluation.
    for custom_th in threshold:
        binarizer = Binarizer(threshold=custom_th).fit(pred_proba_c1)
        custom_pred = binarizer.transform(pred_proba_c1)
        print('Threshold:', custom_th)
        get_clf_eval(y_test, custom_pred)
def cv_mean_std_array(X, y, alphas, n_a, cv=20):
    binarizer = Binarizer(threshold=1400)
    y_binary = binarizer.transform(y).transpose().ravel()
    cv_ols_means, cv_ols_stds, cv_lasso_means, cv_lasso_stds, cv_ridge_means, cv_ridge_stds = \
        np.empty(n_a), np.empty(n_a), np.empty(n_a), np.empty(n_a), np.empty(n_a), np.empty(n_a)
    for i in range(0, n_a):
        print('computing for alpha=%f' % alphas[i])
        # the original had a missing argument here; y_binary is the evident intent
        cv_ols_means[i], cv_ols_stds[i], cv_lasso_means[i], cv_lasso_stds[i], cv_ridge_means[i], cv_ridge_stds[i] = \
            lm_cv_mean_and_std(X, y_binary, alphas[i])
        print('successfully computed iteration %d' % i)
    return cv_ols_means, cv_ols_stds, cv_lasso_means, cv_lasso_stds, cv_ridge_means, cv_ridge_stds
def initialize():
    images, labels = load_mnist_data()
    # binarize pixel values so the Jaccard metric (defined on binary vectors)
    # is meaningful
    binarizer = Binarizer().fit(images)
    images_binarized = binarizer.transform(images)
    knn = KNeighborsClassifier(n_neighbors=3, metric='jaccard')
    knn.fit(images_binarized, labels)
    return knn
def convert_bin(self, feat):
    # threshold at 0.0: positive activations become 1, the rest 0
    feat_bin = Binarizer(threshold=0.0).fit(feat.reshape(-1, 1))
    feat_bin = feat_bin.transform(feat.reshape(-1, 1)).squeeze()
    feat_bin = ''.join(
        [bin(int(i)).replace('0b', '') for i in list(feat_bin)])
    # group the bit string into 16-bit chunks and convert each chunk to an int
    feat_bin = re.findall(r'.{16}', feat_bin)
    feat_int = np.array([int(i, base=2) for i in feat_bin])
    return feat_int
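# A worked example of the bit-packing above (hypothetical values): a length-32
# 0/1 vector becomes a 32-character bit string, which is split into two 16-bit
# chunks and read back as integers.
import numpy as np

bits = np.array([1] + [0] * 15 + [0] * 15 + [1])  # '1' + 15 zeros, 15 zeros + '1'
s = ''.join(str(int(b)) for b in bits)
chunks = [s[i:i + 16] for i in range(0, len(s), 16)]
print([int(c, base=2) for c in chunks])  # [32768, 1]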
def run_binarizer():
    x = [[1, 2, 3, 4, 5, 6, 7],
         [3, 4, 5, 6, 7, 8, 9],
         [1, 7, 2, 6, 2, 7, 2],
         [3, 8, 6, 2, 8, 3, 8]]
    print(x)
    binarizer = Binarizer(threshold=4)
    print(binarizer.transform(x))
def test_binarizer():
    x = [[1, 2, 3, 4, 5],
         [5, 4, 3, 2, 1],
         [3, 3, 3, 3, 3],
         [1, 1, 1, 1, 1]]
    from sklearn.preprocessing import Binarizer
    print("before transform:", x)
    binarizer = Binarizer(threshold=2.5)  # threshold sets the cutoff for the features
    print("after transform:", binarizer.transform(x))
def main():
    PATH = "../pima-indians-diabetes.data.csv"
    columns = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
    df = read_csv(PATH, names=columns)
    array = df.values
    X = array[:, 0:8]
    Y = array[:, 8]
    binarizer = Binarizer(threshold=0.0).fit(X)
    binaryX = binarizer.transform(X)
    set_printoptions(precision=3)
    print(binaryX[0:5, :])
def test_onnxrt_python_Binarizer(self):
    iris = load_iris()
    X, y = iris.data, iris.target
    X_train, X_test, y_train, _ = train_test_split(X, y, random_state=11)
    clr = Binarizer()
    clr.fit(X_train, y_train)

    model_def = to_onnx(clr, X_train.astype(numpy.float32))
    oinf = OnnxInference(model_def)
    got = oinf.run({'X': X_test})
    self.assertEqual(list(sorted(got)), ['variable'])
    exp = clr.transform(X_test)
    self.assertEqualArray(exp, got['variable'], decimal=6)
def test_binarizer_vs_sklearn():
    # Compare msmbuilder.preprocessing.Binarizer
    # with sklearn.preprocessing.Binarizer
    binarizerr = BinarizerR()
    binarizerr.fit(np.concatenate(trajs))

    binarizer = Binarizer()
    binarizer.fit(trajs)

    y_ref1 = binarizerr.transform(trajs[0])
    y1 = binarizer.transform(trajs)[0]

    np.testing.assert_array_almost_equal(y_ref1, y1)
def informationGain(texts, labels, nFeatures=10000):
    vectorizer = CountVectorizer(token_pattern='[a-zA-Z]+', stop_words='english')
    bow = vectorizer.fit_transform(texts)
    # binarize the counts: information gain only needs presence/absence
    transformer = Binarizer().fit(bow)
    bow = transformer.transform(bow)
    names = vectorizer.get_feature_names()
    if nFeatures != -1:
        pos_train = []
        neg_train = []
        for i in range(0, len(labels)):  # originally `labels_train`, an undefined name
            if labels[i] == -1.0:
                neg_train.append(i)
            else:
                pos_train.append(i)
        pos_matrix = bow.tocsr()[pos_train, :]
        neg_matrix = bow.tocsr()[neg_train, :]
        diff = [abs(x - y) for x, y in zip(pos_matrix.mean(axis=0).tolist()[0],
                                           neg_matrix.mean(axis=0).tolist()[0])]
        indexes_sorted = [i[0] for i in sorted(enumerate(diff), key=lambda x: x[1])]
        names_sorted = [names[i] for i in indexes_sorted]
        indexes = indexes_sorted[len(indexes_sorted) - nFeatures:len(indexes_sorted)]
        names = names_sorted[len(indexes_sorted) - nFeatures:len(indexes_sorted)]
        bow = bow.tocsr()[:, indexes]
    info_gain = {}
    labels_entropy = entropy(labels)
    count = 0
    for w in names:
        count += 1
        if count % 500 == 0:
            print(count / bow.shape[1] * 100)
        index = names.index(w)
        column = bow[:, index]
        with_indices = find(column)[0].tolist()
        texts_with_w_labels = [labels[i] for i in range(0, len(labels)) if i in with_indices]
        texts_without_w_labels = [labels[i] for i in range(0, len(labels)) if i not in with_indices]
        info_gain_w = (labels_entropy
                       - (float(len(texts_with_w_labels)) / float(len(labels))) * entropy(texts_with_w_labels)
                       - (float(len(texts_without_w_labels)) / float(len(labels))) * entropy(texts_without_w_labels))
        info_gain[w] = info_gain_w
    return info_gain
class BinarizerImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)
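# A hypothetical usage sketch for the wrapper above, assuming `Op` is bound to
# sklearn.preprocessing.Binarizer as in the surrounding library code:
import numpy as np
from sklearn.preprocessing import Binarizer as Op

impl = BinarizerImpl(threshold=0.5)
X = np.array([[0.2, 0.7], [0.9, 0.1]])
print(impl.fit(X).transform(X))  # [[0. 1.] [1. 0.]]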
def Binarize_Dataset():
    s = start_date()
    e = end_date()
    sym = input_symbol()
    df = yf.download(sym, s, e)
    array = df.values
    X = array[:, 0:5]
    Y = array[:, 5]
    # initialising the binarizer
    binarizer = Binarizer(threshold=0.0).fit(X)
    binaryX = binarizer.transform(X)
    np.set_printoptions(precision=3)
    print('Binarize: values equal to or less than 0 are marked 0 and all of those above 0 are marked 1')
    print(binaryX[0:5, :])
    print("")
    # Splitting the dataset into training and test sets
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
    sc_X = StandardScaler()
    # Feature scaling: fit on the training set, reuse the fitted scaler on the
    # test set (the original refit on the test set, which leaks information)
    X_train = sc_X.fit_transform(X_train)
    X_test = sc_X.transform(X_test)
    print("Training Dataset")
    print(X_train)
    print("")
    print(Y_train)
    print("")
    print("Testing Dataset")
    print(X_test)
    print("")
    print(Y_test)
    print("")
    ans = ['1', '2']
    user_input = input("""
What would you like to do next? Enter option 1 or 2.
1. Menu
2. Exit
Command: """)
    while user_input not in ans:
        print("Error: Please enter a valid option 1-2")
        user_input = input("Command: ")
    if user_input == "1":
        menu()
    elif user_input == "2":
        exit()
def test_Binarizer():
    '''
    test Binarizer method
    :return: None
    '''
    X = [[1, 2, 3, 4, 5],
         [5, 4, 3, 2, 1],
         [3, 3, 3, 3, 3],
         [1, 1, 1, 1, 1]]
    print("before transform:", X)
    binarizer = Binarizer(threshold=2.5)
    print("after transform:", binarizer.transform(X))
def test_Binarizer():
    '''
    Test the usage of Binarizer
    :return: None
    '''
    X = [[1, 2, 3, 4, 5],
         [5, 4, 3, 2, 1],
         [3, 3, 3, 3, 3],
         [1, 1, 1, 1, 1]]
    print("before transform:", X)
    # threshold: values above the threshold become 1, values at or below it become 0
    binarizer = Binarizer(threshold=2.5)
    print("after transform:", binarizer.transform(X))
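# The comparison is strict: sklearn maps x > threshold to 1 and x <= threshold
# to 0, so a value exactly equal to the threshold becomes 0. A minimal check:
import numpy as np
from sklearn.preprocessing import Binarizer

x = np.array([[2.4, 2.5, 2.6]])
print(Binarizer(threshold=2.5).fit(x).transform(x))  # [[0. 0. 1.]]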
def create_target(data, threshold=0.0):
    '''
    Create target variable that is binary {0,1}. Split into X and y.
    data: dataframe
    '''
    data1 = data.dropna().copy()
    binarizer = Binarizer(threshold=threshold)
    target = binarizer.transform(data1[('Returns', 'Next_Month')].values.reshape(-1, 1))
    data1 = data1.join(pd.DataFrame(target,
                                    columns=pd.MultiIndex.from_product([['Returns'], ['Target']]),
                                    index=data1.index))
    return data1
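# A hypothetical usage sketch for create_target, assuming a frame with the
# MultiIndex column ('Returns', 'Next_Month') that the function expects:
import numpy as np
import pandas as pd

cols = pd.MultiIndex.from_product([['Returns'], ['Next_Month']])
df = pd.DataFrame([[0.02], [-0.01], [0.03]], columns=cols)
labeled = create_target(df)
print(labeled[('Returns', 'Target')].tolist())  # [1.0, 0.0, 1.0]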
def binarize(df):
    # `df` is taken as a parameter and returned, since the original both read
    # and reassigned a name that was never defined in the function
    for key, value in userScript.userDefinedBinarizeColumns.items():
        if dataType.dataType(key, df) != "str":
            # user-defined threshold
            userThreshold = value[0]
            col = key
            binarizeColumn = df.filter([col], axis=1)
            df = df.drop(col, axis=1)
            array = binarizeColumn.values
            binarizer = Binarizer(threshold=userThreshold).fit(array)
            binary = binarizer.transform(array)
            df[col] = binary
        else:
            print("The column, ", key, "is of type: string. Cannot binarize")
    return df
class BinarizerTransformer(NumericTransformer):
    def __init__(self, column_id, threshold=0.0):
        NumericTransformer.__init__(self, column_id, "binary", 1)
        self.threshold = threshold
        self.model = Binarizer(threshold=self.threshold)

    def transform1(self, column_data):
        # replace NaNs with -1 so they fall below the (default) threshold
        where_are_NaNs = np.isnan(column_data)
        column_data[where_are_NaNs] = -1
        return np.matrix(self.model.transform(column_data.reshape(1, -1))).T

    def transform(self, dataset, ids):
        column_data = np.array(dataset.values[ids, self.column_id], dtype=np.float64)
        return self.transform1(column_data)
def test_binarizer_converter(self):
    data = np.array([[1, 2, -3], [4, -3, 0], [0, 1, 4], [0, -5, 6]], dtype=np.float32)
    data_tensor = torch.from_numpy(data)

    for threshold in [0.0, 1.0, -2.0]:
        model = Binarizer(threshold=threshold)
        model.fit(data)
        torch_model = hummingbird.ml.convert(model, "torch")
        self.assertIsNotNone(torch_model)
        np.testing.assert_allclose(
            model.transform(data),
            torch_model.transform(data_tensor),
            rtol=1e-06,
            atol=1e-06,
        )
def test_default(self):
    X_train, X_test, y_train, y_test, feature_names = self.load_data()

    actual = BinarizerComponent()
    config = self.get_default(actual)
    actual.set_hyperparameters(config)
    actual.fit(X_train, y_train)
    X_actual = actual.transform(np.copy(X_test))

    expected = Binarizer()
    expected.fit(X_train, y_train)
    X_expected = expected.transform(X_test)

    assert actual.get_feature_names_out(feature_names).tolist() == feature_names
    assert repr(actual.estimator_) == repr(expected)
    assert np.allclose(X_actual, X_expected)
def ge_transform(df_GE, genes):
    # threshold_binarize is assumed to be defined at module level
    scaler = MinMaxScaler()
    print(len(df_GE))
    binarizer = Binarizer(threshold=threshold_binarize)
    df_features = df_GE.transpose()
    print(len(df_features))
    # collapse duplicate gene columns by taking the maximum
    df_features = df_features.groupby(df_features.columns, axis=1).agg(max)
    df_features = df_features[genes]
    scaler.fit(df_features)
    df_features = scaler.transform(df_features)
    binarizer.fit(df_features)
    df_features = binarizer.transform(df_features)
    print(len(df_features))
    df_features = pd.DataFrame(df_features)
    df_features.columns = genes
    return df_features
def test_binarizer():
    X_ = np.array([[1, 0, 5], [2, 3, -1]])

    for init in (np.array, list, sparse.csr_matrix, sparse.csc_matrix):
        X = init(X_.copy())

        binarizer = Binarizer(threshold=2.0, copy=True)
        X_bin = toarray(binarizer.transform(X))
        assert_equal(np.sum(X_bin == 0), 4)
        assert_equal(np.sum(X_bin == 1), 2)
        X_bin = binarizer.transform(X)
        assert_equal(sparse.issparse(X), sparse.issparse(X_bin))

        binarizer = Binarizer(copy=True).fit(X)
        X_bin = toarray(binarizer.transform(X))
        assert_true(X_bin is not X)
        assert_equal(np.sum(X_bin == 0), 2)
        assert_equal(np.sum(X_bin == 1), 4)

        binarizer = Binarizer(copy=True)
        X_bin = binarizer.transform(X)
        assert_true(X_bin is not X)
        X_bin = toarray(X_bin)
        assert_equal(np.sum(X_bin == 0), 2)
        assert_equal(np.sum(X_bin == 1), 4)

        binarizer = Binarizer(copy=False)
        X_bin = binarizer.transform(X)
        if init is not list:
            assert_true(X_bin is X)
        X_bin = toarray(X_bin)
        assert_equal(np.sum(X_bin == 0), 2)
        assert_equal(np.sum(X_bin == 1), 4)

    binarizer = Binarizer(threshold=-0.5, copy=True)
    for init in (np.array, list):
        X = init(X_.copy())
        X_bin = toarray(binarizer.transform(X))
        assert_equal(np.sum(X_bin == 0), 1)
        assert_equal(np.sum(X_bin == 1), 5)
        X_bin = binarizer.transform(X)

    # Cannot use threshold < 0 for sparse
    assert_raises(ValueError, binarizer.transform, sparse.csc_matrix(X))
    ('if|', InteractionFeatures(method=lambda x, y: (y / x),
                                threshold=corr_thresh, subsample=1, logger=logger))
])
pp_pipeline = Pipeline([
    ('removedupes', RemoveDuplicateCols(logger=logger)),
    ('featureextraction', featureunion1),
    ('bounder', Bounder(inf, -inf))
])

#%%
idvs_raw = numpy.load(datafilename + ".npy")
dvs = numpy.load(datafilename + "_dvs.npy")
dvs_binary = binarizer.transform(dvs).reshape((dvs.shape[0],))
idvs = pp_pipeline.fit_transform(idvs_raw, dvs_binary)  # originally passed `idvs`, which was not yet defined

logger.debug("Building models with %s idvs", idvs.shape[1])

#%% Loss models
#corrs = numpy.array([numpy.abs(numpy.corrcoef(dvs_binary.T, idvs[:, i])[0, 1]) for i in xrange(idvs.shape[1])])
#corrs2 = numpy.array([numpy.abs(numpy.corrcoef(dvs_binary.T, idvs2[:, i])[0, 1]) for i in xrange(idvs2.shape[1])])
#idvs3 = numpy.hstack((idvs[:, numpy.where(corrs > 0.145)[0]], idvs2[:, numpy.where(corrs2 > 0.11)[0]],))
#print idvs3.shape
idvs = Bounder(inf, -inf).transform(idvs)
# # Binarization

# In[6]:

watched = np.array(popsong_df['listen_count'])
watched[watched >= 1] = 1
popsong_df['watched'] = watched
popsong_df.head(10)

# In[7]:

from sklearn.preprocessing import Binarizer

bn = Binarizer(threshold=0.9)
pd_watched = bn.transform([popsong_df['listen_count']])[0]
popsong_df['pd_watched'] = pd_watched
popsong_df.head(11)

# # Rounding

# In[8]:

items_popularity = pd.read_csv('datasets/item_popularity.csv', encoding='utf-8')
items_popularity

# In[9]:

items_popularity['popularity_scale_10'] = np.array(
    np.round((items_popularity['pop_percent'] * 10)), dtype='int')
varSizeStatisticsTrain = zeros(numCombinations, dtype=float)
varSizeStatisticsTest = zeros(numCombinations, dtype=float)
a = 0
mnist = fetch_mldata('MNIST original')

# split a training set and a test set
y_train, y_test = mnist.target[:60000], mnist.target[60000:70000]

#vectorizer = CountVectorizer(binary=True)
X_both = mnist.data
# binarize pixel intensities at 50; the original `Binarizer().fit(50, X_both)`
# passed the threshold as data, which is presumably not what was intended
binarizer = Binarizer(threshold=50).fit(X_both)
X_both = binarizer.transform(X_both)
X_train = X_both[:60000]
X_test = X_both[60000:70000]
#print(X_train[1])

#ch2 = SelectKBest(chi2, 750)
#X_train = ch2.fit_transform(X_train, y_train)
#X_test = ch2.transform(X_test)
data_train = X_train
m, n = data_train.shape
print(m, " ", n)
X = (news_data * lasso_est.transpose())  # multiply element-wise with the lasso estimate
df_Lasso = X[X.columns[(X != 0).any()]]  # remove columns where all elements are zero
print(df_Lasso.shape)  # the number of columns should shrink significantly, depending on the choice of alpha
df_Lasso.columns.values.tolist()

# In[104]:

# obtain a split
# from sklearn.cross_validation import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(df_Lasso, news_labels)

# binarize
from sklearn.preprocessing import Binarizer

binarizer = Binarizer(threshold=binary_threshold)
binary_labels = binarizer.transform(news_labels).transpose().ravel()
# .ravel() is to fix the "Too many array indices" error
print(binary_labels.shape)

# In[107]:

from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import cross_val_score

knn = KNeighborsClassifier(n_neighbors=1)  # arbitrary k
cv = cross_val_score(knn, df_Lasso, binary_labels, cv=10)
print("Cross Validation Scores")
print(cv)
print('Mean Cross Validation Score')
print(np.mean(cv))
# binarization
from sklearn.preprocessing import Binarizer
import pandas
import numpy

url = "https://goo.gl/vhm1eU"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pandas.read_csv(url, names=names)
array = dataframe.values
# separate array into input and output components
X = array[:, 0:8]
Y = array[:, 8]
binarizer = Binarizer(threshold=0.0).fit(X)
binaryX = binarizer.transform(X)
# summarize transformed data
numpy.set_printoptions(precision=3)
print(binaryX[0:5, :])
# Comment the section below out if you have already made the pickle files
# ---------------------------------------------------------------------------------------
all_bigr = ngram(X_train, 'bigram')  # starting with all features

print("Starting counting bigrams...")
X_train_bi_counted = count(X_train, all_bigr, 'bigram')
print("Done counting train set")
X_test_bi_counted = count(X_test, all_bigr, 'bigram')
print("Done counting test set")

print("Binarizing and dumping files")
binarizer = Binarizer()  # renamed from `bin`, which shadows the builtin
X_train_bi_binary = binarizer.fit_transform(X_train_bi_counted)
X_test_bi_binary = binarizer.transform(X_test_bi_counted)
pickle.dump(X_train_bi_binary, open("X_train_bi_binary.p", "wb"))
pickle.dump(X_test_bi_binary, open("X_test_bi_binary.p", "wb"))
print("Done")

print("Starting tfidf vectors...")
X_train_bi_tfidf, X_test_bi_tfidf = tfidf(X_train_bi_counted, X_test_bi_counted)
pickle.dump(X_train_bi_tfidf, open("X_train_bi_tfidf.p", "wb"))
pickle.dump(X_test_bi_tfidf, open("X_test_bi_tfidf.p", "wb"))
print("Done")

print("Starting feature selection using CART random forests on binary files")
indices_important_feats_bi_bin = tree(X_train_bi_binary, y_train, all_bigr, 'Bigram_binary')
pickle.dump(indices_important_feats_bi_bin, open("indices_important_feats_bi_bin.p", "wb"))
class Binarizer(TransformerMixin):
    """
    Implements several feature-binarization strategies: computes optimal
    thresholds and binarizes the features against those thresholds.

    Arguments:
    ----------
    method: str ('random', 'log_odds' or 'bns'),
        the feature-binarization method
    divide_to_bins: bool (optional, default=True),
        whether to quantize continuous features into integer bins
    bins_number: int (optional, default=10),
        the number of possible integer feature values after binning
    """
    _UNSUPERVISED_METHODS = ['random']
    _SUPERVISED_METHODS = ['log_odds', 'bns']
    _CONTINGENCY_METHODS = ['log_odds', 'bns']

    def __init__(self, method, divide_to_bins=True, bins_number=10):
        self.method = method
        self.divide_to_bins = divide_to_bins
        self.bins_number = bins_number

    def fit(self, X, y=None):
        """
        Fits the binarizer to the data
        """
        # print("Fitting binarizer...")
        methods = Binarizer._UNSUPERVISED_METHODS + Binarizer._SUPERVISED_METHODS
        if self.method not in methods:
            raise ValueError("Method should be one of {0}".format(", ".join(methods)))
        X = check_array(X, accept_sparse=['csr', 'csc'])
        if issparse(X):
            X = X.tocsc()
        if self.method in Binarizer._UNSUPERVISED_METHODS:
            self._fit_unsupervised(X)
            self.joint_thresholds_ = self.thresholds_
            self.joint_scores_ = self.scores_
        else:
            if y is None:
                raise ValueError("y must not be None for supervised binarizers.")
            # TODO: move into a separate function
            # y = np.array(y)
            # if len(y.shape) == 1:
            #     self.classes_, y = np.unique(y, return_inverse=True)
            #     nclasses = self.classes_.shape[0]
            #     Y_new = np.zeros(shape=(y.shape[0], nclasses), dtype=int)
            #     Y_new[np.arange(y.shape[0]), y] = 1
            # else:
            #     self.classes_ = np.arange(y.shape[1])
            #     Y_new = y
            label_binarizer = SK_LabelBinarizer()
            Y_new = label_binarizer.fit_transform(y)
            self.classes_ = label_binarizer.classes_
            if X.shape[0] != Y_new.shape[0]:
                raise ValueError("X and y have incompatible shapes.\n"
                                 "X has %s samples, but y has %s."
                                 % (X.shape[0], Y_new.shape[0]))
            self._fit_supervised(X, Y_new)
            if len(self.classes_) <= 2:
                self.joint_thresholds_ = self.thresholds_[:, 0]
                self.joint_scores_ = self.scores_[:, 0]
            else:
                min_class_scores = np.min(self.scores_, axis=0)
                max_class_scores = np.max(self.scores_, axis=0)
                diffs = max_class_scores - min_class_scores
                diffs[np.where(diffs == 0)] = 1
                normalized_scores = (self.scores_ - min_class_scores) / diffs
                # for each feature, find the class for which it is most useful
                # NOTE: this could probably be done differently
                optimal_indexes = np.argmax(normalized_scores, axis=1)
                nfeat = self.thresholds_.shape[0]
                # as the binarization threshold of each feature, take its value
                # for the class where the feature is most useful
                self.joint_thresholds_ = self.thresholds_[np.arange(nfeat), optimal_indexes]
                self.joint_scores_ = self.scores_[np.arange(nfeat), optimal_indexes]
        # hand the thresholds over to sklearn's Binarizer
        self.binarize_transformer_ = SK_Binarizer(self.joint_thresholds_)
        return self

    def transform(self, X):
        """
        Applies the binarizer to the data
        """
        print("Transforming binarizer...")
        if hasattr(self, 'binarize_transformer_'):
            return self.binarize_transformer_.transform(X)
        else:
            raise ValueError("Transformer is not fitted")

    def _fit_unsupervised(self, X):
        """
        Dispatcher for unsupervised threshold-selection methods
        """
        if self.method == 'random':
            # random thresholds and scores
            if issparse(X):
                minimums = X.min(axis=0).toarray()
                maximums = X.max(axis=0).toarray()
            else:
                minimums = np.min(X, axis=0)
                maximums = np.max(X, axis=0)
            random_numbers = np.random.rand(X.shape[1], 1).reshape((X.shape[1],))
            self.thresholds_ = minimums + (maximums - minimums) * random_numbers
            self.scores_ = np.random.rand(X.shape[1], 1).reshape((X.shape[1],))
        return self

    def _fit_supervised(self, X, y):
        """
        Performs supervised threshold selection
        """
        # quantize X into integer bins if requested
        if self.divide_to_bins:
            bin_divider = BinDivider(bins_number=self.bins_number)
            X = bin_divider.fit_transform(X)
        thresholds, scores = [], []
        for i in range(X.shape[1]):
            threshold, score = self._find_optimal_thresholds(X[:, i], y)
            thresholds.append(threshold)
            scores.append(score)
        self.thresholds_ = np.asarray(thresholds, dtype=np.float64)
        self.scores_ = np.asarray(scores, dtype=np.float64)
        return self

    def _find_optimal_thresholds(self, column, y):
        """
        Computes the binarization thresholds

        Arguments:
        ----------
        column: array-like, shape=(nobj,), the column of feature values
        y: array-like, shape=(nobj, nclasses), the 0/1 class matrix
        """
        classes_number = y.shape[1]
        # compute per-class occurrence counts of the feature values
        values, counts = \
            _collect_column_statistics(column, y, classes_number=classes_number, precision=6)
        if self.method in Binarizer._CONTINGENCY_METHODS:
            # binary classification
            if classes_number <= 2:
                counts = [counts]
            else:
                summary_counts = np.sum(counts, axis=1)
                counts = [np.array((summary_counts - counts[:, i], counts[:, i])).T
                          for i in np.arange(classes_number)]
            best_thresholds = [None] * len(counts)
            best_scores = [None] * len(counts)
            for i in np.arange(len(counts)):
                current_thresholds, current_tables = \
                    _collect_contingency_tables(values, counts[i])
                if self.method == "log_odds":
                    func = (lambda x: odds_ratio(x, alpha=0.1))
                elif self.method == 'information_gain':
                    func = information_gain
                elif self.method == 'bns':
                    func = bns
                else:
                    raise ValueError("Wrong binarization method: {0}".format(self.method))
                scores = [func(table) for table in current_tables]
                best_score_index = np.argmax(scores)
                best_thresholds[i] = current_thresholds[best_score_index]
                best_scores[i] = scores[best_score_index]
        return best_thresholds, best_scores
def by_threshold(self, threshold=0.0):
    # `Skbin` is an alias for sklearn.preprocessing.Binarizer;
    # renamed the local from `bin`, which shadows the builtin
    binarizer = Skbin(threshold).fit(self.M)
    return binarizer.transform(self.M)
print('Loading test data...')
with open('data/test-svmlight.dat') as infile:
    lines = infile.readlines()
n_samples = len(lines)
test = lil_matrix((n_samples, n_features))
for n, line in enumerate(lines):
    for word_count in line.split():
        fid, count = word_count.split(':')
        test[n, int(fid)] = int(count)  # store the count; the original stored int(fid), a likely slip
test = test.tocsr()

if opts.binarize:
    print('Binarizing the data...')
    binar = Binarizer(copy=False)
    X = binar.transform(X)
    test = binar.transform(test)

if opts.tfidf:
    print('Transforming word occurrences into TF-IDF...')
    tranny = TfidfTransformer()
    X = tranny.fit_transform(X)
    test = tranny.transform(test)

if opts.select_features:
    k_features = int(opts.k_features)
    if opts.select_features == 'k-best':
        print('Selecting %i best features...' % k_features)
        ch2 = SelectKBest(chi2, k=k_features)
    if opts.select_features == 'pct':
        print('Selecting features in the top %i percentile...' % k_features)
news_labels = extracted_data[' shares']  # take the shares column for labels

# Data Preprocessing
news_data_transpose = news_data.transpose()
data_into_dict = news_data_transpose.to_dict()
list_data = [v for k, v in data_into_dict.items()]

# Encode
from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer()
transformed_data = dv.fit_transform(list_data).toarray()

# Label Encoder - Binarization
from sklearn.preprocessing import Binarizer
binarizer = Binarizer(threshold=1400)  # threshold at 1400 because the median of shares is 1400
transformed_labels = binarizer.transform(news_labels)
transformed_labels = transformed_labels.transpose().ravel()
# .ravel() is to fix the "Too many array indices" error
# Could be a scikit or pandas bug

############## Classification #################
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC

# Decision Tree Classifier
tree = DecisionTreeClassifier()
knn = KNeighborsClassifier()
gnb = GaussianNB()
# lr = LinearRegression()
try:  # the opening `try:` was truncated in the original fragment
    hub_ego = nx.ego_graph(Gsim, head_node, radius=1)  # step 1 only
except NameError:
    head_node = each[0]
    hub_ego = nx.ego_graph(Gsim, head_node, radius=1)
index = list(hub_ego.nodes())
# pec = random.uniform(0.5, 0.8)  # percentage of nodes selected, between [0.5, 0.8]
pec = 0.8
random.shuffle(index)
subidx = index[:int(pec * len(index))]
Y = np.zeros(num_sample)
Y[::5] += 3 * (0.5 - np.random.rand(num_sample // 5))  # add noise to targets
for each in subidx[1:]:
    Y += np.power(data_mat[:, each], 3)
binarizer = Binarizer()
label = binarizer.transform(Y)

# output the gene expression matrix
ofp = open('nonlinear2.' + str(i) + '.genemat', 'w')
for each in sorted(Gsim.nodes()):
    print(str(each) + '\t' + '\t'.join(map(str, data_mat[:, each])), file=ofp)
print('outcome\t' + '\t'.join(map(str, label)), file=ofp)
ofp.close()

#print('significant network', index)
nx.write_adjlist(Gsim, "nonlinear2." + str(i) + ".adjlist")
os.system('epd_python svmnet.py -n nonlinear2.' + str(i) + '.adjlist -g nonlinear2.' + str(i) + '.genemat -o nonlinear2.svm.' + str(i) + '.txt -s 0')
# os.system('epd_python ../rfnet.py -n nonlinear2.adjlist -g nonlinear2.genemat -o nonlinear2.rf.txt -s 0 -r 20')
os.system('epd_python knnnet.py -n nonlinear2.' + str(i) + '.adjlist -g nonlinear2.' + str(i) + '.genemat -o nonlinear2.knn.' + str(i) + '.txt -s 0')
svm_count += count_net('nonlinear2.svm.' + str(i) + '.txt', index)
#rf_count += count_net('nonlinear2.rf.txt', index)
knn_count += count_net('nonlinear2.knn.' + str(i) + '.txt', index)
# In[3]:

# Import csv data
raw_data = pd.read_csv('OnlineNewsPopularity_wLabels_deleteNoise.csv').iloc[:, 1:]  # read in csv, omit the first column of url
raw_data = raw_data.iloc[:, :-1]
news_data = raw_data.iloc[:, :-1]  # take up to the second-to-last column
news_labels = raw_data.iloc[:, -1]  # take the shares column for labels

# Binarize
print('\nBinary Threshold:')
binary_threshold = np.median(raw_data[' shares'])
news_data = news_data.drop(' n_non_stop_words', 1)
print(binary_threshold)
binarizer = Binarizer(threshold=binary_threshold)
y_binary = binarizer.transform(news_labels).transpose().ravel()

# In[ ]:

# Discretize

# In[25]:

# Decision Tree
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier()
print('Decision Tree Classifier Accuracy Rate')
tree_score = cross_val_score(tree, news_data, y_binary, cv=10)
np.mean(tree_score)
from sklearn.preprocessing import Binarizer, LabelEncoder, OneHotEncoder

onehot_encoder = OneHotEncoder()
label_encoder = LabelEncoder()

x = ['a', 'b', 'c']
label_x = label_encoder.fit_transform(x).reshape([len(x), 1])
print(label_x)
print(onehot_encoder.fit_transform(label_x).toarray())

binarizer = Binarizer(threshold=1.0).fit(label_x)
print(binarizer.transform(label_x))
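# For reference, a minimal check of the expected values: 'a', 'b', 'c' encode
# to 0, 1, 2; the one-hot output is the 3x3 identity; and with threshold=1.0
# only the entry strictly greater than 1 survives.
import numpy as np
assert np.array_equal(label_x.ravel(), [0, 1, 2])
assert np.array_equal(onehot_encoder.fit_transform(label_x).toarray(), np.eye(3))
assert np.array_equal(binarizer.transform(label_x).ravel(), [0, 0, 1])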
X_tokens = tokenizer.transform(X_train)

# Train Recurrent Neural Network
model = train_RNN(tokenizer, X_tokens, y_train)
y_pred_tr = model.predict(X_tokens).flatten()

# Check overall performance
test_tokens = tokenizer.transform(X_test)
y_pred_tst = model.predict(test_tokens).flatten()

# Convert predictions to binary
yhat_train = y_pred_tr.reshape(-1, 1)
yhat_test = y_pred_tst.reshape(-1, 1)
binarizer = Binarizer(threshold=0.5).fit(yhat_train)
yhat_tr_b = binarizer.transform(yhat_train).astype(int)
yhat_tst_b = binarizer.transform(yhat_test).astype(int)

save(model, 'review_score_full.pkl')  # the filename was missing its quotes
with open('review_tokenizer_full.pkl', 'wb') as fileObject:
    pickle.dump(tokenizer, fileObject)

# # Save model for future use
# save(model, 'review_scorer1.pkl')
# # model = load('review_scorer.pkl')
# with open('review_tokenizer1.pkl','wb') as fileObject:
#     pickle.dump(tokenizer, fileObject)

# Scorers to consider
# score()
}

#%%
os.chdir(workspace)
dev_idvs_all = numpy.nan_to_num(numpy.load(dev_filename + ".npy"))
val_idvs_all = numpy.nan_to_num(numpy.load(val_filename + ".npy"))
dev_dvs = numpy.nan_to_num(numpy.load(dev_filename + "_dvs.npy"))
val_dvs = numpy.nan_to_num(numpy.load(val_filename + "_dvs.npy"))

binarizer = Binarizer(copy=True, threshold=thresh)
imputer = Imputer(copy=False)
dev_dvs_binary = binarizer.transform(dev_dvs).reshape((dev_dvs.shape[0],))
val_dvs_binary = binarizer.transform(val_dvs).reshape((val_dvs.shape[0],))

"""
from statsmodels.regression import quantile_regression
dev_idvs2 = dev_idvs[:10000, :]
inds = [i for i in xrange(dev_idvs2.shape[1]) if len(unique(dev_idvs2[:, i])) > 1]
dev_dvs2 = dev_dvs[:10000, :].reshape((10000,))
model = quantile_regression.QuantReg(dev_dvs2, dev_idvs2)
model.fit()
"""
"""
#plot(mae_dev)