def words_killer(self, train, data, method, words_num_names_kept=50):
    if method == 'nb':
        normalizer = Binarizer()
        normalized_data = pd.DataFrame(normalizer.fit_transform(data))
        normalized_data.index = data.index
        train_data = pd.concat([normalized_data, train['label_class']],
                               axis=1, join='inner')
        clf = BernoulliNB()
        clf.fit(train_data.drop('label_class', axis=1),
                train_data['label_class'])
        print('words killer auc: ',
              cross_val_score(clf, train_data.drop('label_class', axis=1),
                              train_data['label_class'], scoring='roc_auc'))
        fe = pd.Series(clf.coef_[0])
        fe.index = data.columns
        fe = fe.abs().sort_values(ascending=False)[:words_num_names_kept]
        return data[fe.index]
    elif method == 'pca':
        clf = PCA(n_components=words_num_names_kept)
        train_data = pd.DataFrame(clf.fit_transform(data))
        train_data.index = data.index
        return train_data
    elif method == 'lg':
        normalizer = MinMaxScaler()
        normalized_data = pd.DataFrame(normalizer.fit_transform(data))
        normalized_data.index = data.index
        train_data = pd.concat([normalized_data, train['label_class']],
                               axis=1, join='inner')
        clf = LogisticRegression(class_weight='balanced')
        clf.fit(train_data.drop('label_class', axis=1),
                train_data['label_class'])
        print('words killer auc: ',
              cross_val_score(clf, train_data.drop('label_class', axis=1),
                              train_data['label_class'], scoring='roc_auc'))
        fe = pd.Series(clf.coef_[0])
        fe.index = data.columns
        fe = fe.abs().sort_values(ascending=False)[:words_num_names_kept]
        return data[fe.index]
    else:
        return data
def convert_to_classification(X_train, X_test, y_train, y_test,
                              threshold, num_features):
    # Discretize y values, i.e. convert the target variable from a
    # continuous value into a categorical one.
    # The median target variable value is used as the threshold.
    transformer = Binarizer(threshold=threshold)
    y_train = y_train.reshape(-1, 1)
    y_test = y_test.reshape(-1, 1)
    X_test = X_test.reshape(-1, num_features)
    X_train = X_train.reshape(-1, num_features)
    y_train_discretized = transformer.fit_transform(y_train)
    y_test_discretized = transformer.fit_transform(y_test)
    print(X_train.shape)
    print(X_test.shape)
    print(y_train_discretized.shape)
    print(y_test_discretized.shape)
    return X_train, X_test, y_train_discretized, y_test_discretized
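# A minimal usage sketch for convert_to_classification (not from the
# original source): the random data, shapes, and median threshold below are
# made-up assumptions, and the sklearn/numpy imports used above are expected
# to be in scope.
import numpy as np
X_tr, X_te = np.random.rand(80, 4), np.random.rand(20, 4)
y_tr, y_te = np.random.rand(80), np.random.rand(20)
X_tr, X_te, y_tr_bin, y_te_bin = convert_to_classification(
    X_tr, X_te, y_tr, y_te, threshold=np.median(y_tr), num_features=4)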
def test_wine():
    """Sample test on the Wine UCI dataset.

    Please do note this test is _not_ conclusive, but the zero class is so
    well-separated that all the variations should do well on this specific
    class.
    """
    wine = sklearn.datasets.load_wine()
    train_data = Normalizer().fit_transform(wine.data)
    nsample, nfeatures = train_data.shape
    bindata = np.zeros(train_data.shape)
    for i in range(nfeatures):
        binarizer = Binarizer(threshold=train_data[:, i].mean())
        bindata[:, i] = binarizer.fit_transform(
            train_data[:, i].reshape(-1, 1)).ravel()
    model = BernoulliBayesianSet(bindata, meanfactor=2,
                                 alphaepsilon=0.0001, betaepsilon=0.0001)
    some_zero_class_indices = [0, 3, 5]
    ranking = np.argsort(model.query(some_zero_class_indices))[::-1]
    top10 = ranking[:10]
    truepositives = (wine.target[top10] == 0).sum()
    precision = truepositives / 10
    # allows a single mistake
    assert precision >= 0.9
def test_logistic_regression_cv_serializer(self):
    logistic_regression = LogisticRegressionCV(fit_intercept=True)
    logistic_regression.mlinit(input_features='a',
                               prediction_column='e_binary')

    extract_features = ['e']
    feature_extractor = FeatureExtractor(
        input_scalars=['e'],
        output_vector='extracted_e_output',
        output_vector_items=["{}_out".format(x) for x in extract_features])

    binarizer = Binarizer(threshold=0.0)
    binarizer.mlinit(prior_tf=feature_extractor,
                     output_features='e_binary')

    Xres = binarizer.fit_transform(self.df[['a']])

    logistic_regression.fit(self.df[['a']], Xres)
    logistic_regression.serialize_to_bundle(self.tmp_dir,
                                            logistic_regression.name)

    # Test model.json
    with open("{}/{}.node/model.json".format(
            self.tmp_dir, logistic_regression.name)) as json_data:
        model = json.load(json_data)

    self.assertEqual(model['op'], 'logistic_regression')
    self.assertTrue(model['attributes']['intercept']['double'] is not None)
def one_hot_vectorize_scikitLearn(corpus):
    freq = CountVectorizer()
    corpus = freq.fit_transform(corpus)

    onehot = Binarizer()
    vector = onehot.fit_transform(corpus.toarray())
    return vector
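# Example call on a toy corpus (hypothetical strings, not from the original
# code): counts above zero collapse to 1, so a term like "the" that appears
# twice in the second document is encoded as a mere presence flag.
toy_corpus = ["the cat sat", "the cat sat on the mat"]
print(one_hot_vectorize_scikitLearn(toy_corpus))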
def discretization(df, label_name):
    df_Ent = {}
    baseEntropy = calcShannonEnt(df)
    print('Base information gain is: %f' % baseEntropy)
    predictor = [x for x in df.columns if x != label_name]
    for x in predictor:
        bestInfoGain = 0.0
        df_Ent[x] = {}
        for row in range(len(df) - 1):
            newEntropy = 0.0
            sort_df = df[[x, label_name]].sort_values(by=x, ascending=True)
            if sort_df.iloc[row, 0] == sort_df.iloc[row + 1, 0]:
                continue
            split_point = (sort_df.iloc[row, 0] + sort_df.iloc[row + 1, 0]) / 2
            bin_encoder = Binarizer(threshold=split_point)
            sort_df[x] = bin_encoder.fit_transform(
                sort_df[x].values.reshape(-1, 1))
            for value in [0, 1]:
                subdataset = sort_df[sort_df[x] == value]
                prob = len(subdataset) / float(len(sort_df))
                newEntropy += prob * calcShannonEnt(subdataset)
            infoGain = baseEntropy - newEntropy
            if infoGain > bestInfoGain:
                df_Ent[x]['best_point'] = split_point
                df_Ent[x]['Ent'] = infoGain
                bestInfoGain = infoGain
        print('Best split point for %s is %f, max information gain is %f.'
              % (x, df_Ent[x]['best_point'], df_Ent[x]['Ent']))
    return df_Ent
def preprocess(data):
    """Preprocess data using the same steps that were used during model
    building."""
    df = pd.read_json(data)

    # Separate the dataframe into numeric and categorical variables
    # for ease of handling and processing
    numeric_vars = df.select_dtypes(exclude=('category', 'object'))
    cat_vars = df.select_dtypes(include=('category', 'object'))

    # Scale numeric variables; keep the result as a DataFrame so the
    # columns below stay addressable by name
    scaler = StandardScaler()
    scaled_numeric_vars = pd.DataFrame(scaler.fit_transform(numeric_vars),
                                       columns=numeric_vars.columns,
                                       index=numeric_vars.index)

    # Binarize the capital gain/loss columns
    binarize = Binarizer()
    scaled_numeric_vars['capital_gain'] = binarize.fit_transform(
        scaled_numeric_vars['capital_gain'].values.reshape(-1, 1))
    scaled_numeric_vars['capital_loss'] = binarize.fit_transform(
        scaled_numeric_vars['capital_loss'].values.reshape(-1, 1))

    # Get dummy variables of the categorical data
    dummy_vars = pd.get_dummies(data=cat_vars)

    # Merge the dataframes
    new_df = pd.merge(left=scaled_numeric_vars, right=dummy_vars,
                      left_index=True, right_index=True)

    # Select the features used to train the model
    transformed_df = column_data.transform(new_df)
    return transformed_df
def binarization(features, threshold=0.0, is_copy=True):
    """
    DONE: binarize numeric features.
    """
    bined = Binarizer(threshold=threshold, copy=is_copy)
    transformed_data = bined.fit_transform(features)
    return transformed_data
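# A quick sanity check of the helper above on made-up values: with the
# default threshold of 0.0, strictly positive entries map to 1 and
# everything else maps to 0.
import numpy as np
print(binarization(np.array([[-1.5, 0.0, 0.3], [2.0, -0.1, 0.0]])))
# -> [[0. 0. 1.]
#     [1. 0. 0.]]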
def getTrigramEncoding(text_array):
    freq = CountVectorizer(ngram_range=(3, 3), analyzer='char_wb')  # trigram
    corpus_trigrams = freq.fit_transform(text_array)

    onehot = Binarizer()
    corpus_trigrams_one_hot = onehot.fit_transform(corpus_trigrams.toarray())

    return freq, corpus_trigrams_one_hot
def binarizeMatrix(dataMatrix, threshold):
    """
    Maps every entry of the input to either 0 or 1: values less than or
    equal to the threshold map to 0, values greater than it map to 1.
    """
    binarizer = Binarizer(threshold=threshold)
    dataMatrix = binarizer.fit_transform(dataMatrix)
    return dataMatrix
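# Illustration on a made-up matrix: with threshold=0.5, entries at or below
# 0.5 become 0 and entries above it become 1.
import numpy as np
print(binarizeMatrix(np.array([[0.2, 0.5, 0.7], [0.9, 0.1, 0.51]]),
                     threshold=0.5))
# -> [[0. 0. 1.]
#     [1. 0. 1.]]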
def numeric2binary_preprocessor(df, binary_cols, threshold=0.0):
    binarized_cols = {}
    for col in binary_cols:
        binarizer = Binarizer(threshold=threshold, copy=False)
        df[col + "_binary"] = pd.DataFrame(binarizer.fit_transform(df[[col]]),
                                           index=df.index)
        df.drop(col, axis=1, inplace=True)
        binarized_cols[col] = binarizer
    return binarized_cols
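# Usage sketch on a toy frame (the column names are assumptions): each
# listed column is replaced in place by a 0/1 "<col>_binary" twin, and the
# fitted binarizers are returned keyed by the original column name.
import pandas as pd
toy = pd.DataFrame({"x": [-1.0, 0.5, 2.0], "y": [1, 2, 3]})
fitted = numeric2binary_preprocessor(toy, binary_cols=["x"], threshold=0.0)
print(toy)          # "x" is dropped; "x_binary" holds 0, 1, 1
print(fitted["x"])  # the Binarizer that was fit on column "x"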
def getPresenceFeatures(data):
    vectorizer = CountVectorizer(
        analyzer='word',
        lowercase=False,
    )
    features2 = vectorizer.fit_transform(data)  # .toarray()
    # Unigram features
    binarizer = Binarizer()
    presenceFeatures = binarizer.fit_transform(features2)
    return presenceFeatures, vectorizer
def preprocess(logit, label):
    logit = toone(logit)
    if logit[-1][Data.intentdict[1]['none']] > 0.5 and \
            label[-1][Data.intentdict[1]['none']] > 0.5:
        logit[-1][Data.intentdict[1]['none']] = int(0)
        label[-1][Data.intentdict[1]['none']] = int(0)
    binarizer = Binarizer(threshold=0.2)
    logit = binarizer.fit_transform([logit[-1]])
    label = binarizer.fit_transform([label[-1]])
    return logit[-1], label[-1]
def getTrigramFeatures(data):
    vectorizer = CountVectorizer(
        analyzer='word',
        lowercase=False,
        ngram_range=(1, 3),
    )
    features = vectorizer.fit_transform(data)
    binarizer = Binarizer()
    tgFt = binarizer.fit_transform(features)
    return tgFt, vectorizer
def BinOutcome(dataset):
    combatPts = []
    for poke in dataset:
        combatPts.append(poke.ptOut)
    meanPtOut = np.mean(combatPts)
    combatPts = np.array(combatPts)
    combatPts = combatPts.reshape(1, -1)
    binarizerP = Binarizer(threshold=meanPtOut)
    return binarizerP.fit_transform(combatPts)
def preprocess(logits, labels):
    logits = toone(logits)
    binarizer = Binarizer(threshold=0.2)
    for i in range(len(logits)):
        if logits[i][Data.intentdict[1]['none']] > 0.5 and \
                labels[i][Data.intentdict[1]['none']] > 0.5:
            logits[i][Data.intentdict[1]['none']] = int(0)
            labels[i][Data.intentdict[1]['none']] = int(0)
    logits = binarizer.fit_transform(logits)
    labels = binarizer.fit_transform(labels)
    return logits.flatten(), labels.flatten()
def __labelBinarizer(self, threshold):
    """---labelBinarizer-----------------------------------------
    Values greater than the threshold map to 1, while values
    less than or equal to the threshold map to 0.
    ---Parameters
    threshold : float
    ---Return
    None
    ------------------------------------------------------"""
    from sklearn.preprocessing import Binarizer
    binarizer = Binarizer(threshold=threshold)
    self.__yData = binarizer.fit_transform(self.__yData)
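# A standalone illustration of the thresholding rule documented above
# (toy values, independent of the class): only strictly greater values map
# to 1, so 0.5 itself maps to 0 with threshold=0.5.
from sklearn.preprocessing import Binarizer
import numpy as np
print(Binarizer(threshold=0.5).fit_transform(
    np.array([[0.2], [0.5], [0.8]])))
# -> [[0.]
#     [0.]
#     [1.]]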
def calculate_score(predict_output, ground_truth, talker=None):
    test_talker = open('Data/test/talker', 'r').readlines()
    ret_pred_outputs = list()
    ret_ground_truth = list()
    talker_cnt = -1
    for pred, label in zip(predict_output, ground_truth):
        talker_cnt += 1
        if len(test_talker) <= talker_cnt:
            talker_cnt = len(test_talker) - 1
        if test_talker[talker_cnt].strip('\n') != talker and talker != 'ALL':
            continue
        pred_act = pred[:5]        # the first 5 entries are the act
        pred_attribute = pred[5:]  # the remaining entries are the attributes
        binary = Binarizer(threshold=0.5)
        act_logit = one_hot(np.argmax(pred_act), "act")
        attribute_logit = binary.fit_transform([pred_attribute])
        if np.sum(attribute_logit) == 0:
            attribute_logit = one_hot(np.argmax(pred_attribute), "attribute")
        label = binary.fit_transform([label])
        ret_pred_outputs = np.append(ret_pred_outputs,
                                     np.append(act_logit, attribute_logit))
        ret_ground_truth = np.append(ret_ground_truth, label)
    return ret_pred_outputs, ret_ground_truth
def sklearn_one_hot_vectorize(corpus):
    # The Sklearn one hot vectorize method

    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.preprocessing import Binarizer

    freq = CountVectorizer()
    vectors = freq.fit_transform(corpus)
    print(len(vectors.toarray()[0]))

    onehot = Binarizer()
    vectors = onehot.fit_transform(vectors.toarray())
    print(len(vectors[0]))
def wine_quality_white():
    # White wine quality dataset
    filename = '../../data/raw/mldata/winequality-white.csv'

    # The data corresponds to the first 11 columns of the csv file
    data = np.loadtxt(filename, usecols=tuple(range(11)),
                      delimiter=';', dtype=float)

    # Read the label: we need to binarise it using a threshold at 4
    bn = Binarizer(threshold=4)
    label = bn.fit_transform(np.loadtxt(filename, usecols=(11,),
                                        delimiter=';',
                                        dtype=int).reshape(-1, 1))

    # We need to invert the label -> 1=0 and 0=1
    label = np.ravel(np.abs(label - 1))

    np.savez('../../data/clean/uci-wine-quality-white.npz',
             data=data, label=label)
def do_logreg():
    from sklearn.preprocessing import Binarizer, scale
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score, classification_report
    from sklearn.cross_validation import train_test_split
    from sklearn.cross_validation import cross_val_score
    from sklearn.grid_search import GridSearchCV
    from scipy.stats import expon
    import numpy as np
    import pandas

    # Assumed fold count; the original references a module-level nfolds
    nfolds = 10

    ### load data
    col_names = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
                 'acceleration', 'model_year', 'origin', 'car_name']
    df = pandas.read_csv('auto_mpg.csv')
    df.columns = col_names
    df = df.drop('car_name', 1)

    lr = LogisticRegression()
    bn = Binarizer(threshold=df['mpg'].mean())
    print "Performing binarization of the mpg variable into above/below average classes"
    target = bn.fit_transform(df['mpg'])
    data = df.drop('mpg', 1)
    data = scale(data)

    print "Splitting into training and test sets"
    data_train, data_test, target_train, target_test = train_test_split(
        data, target, test_size=0.5, random_state=0)

    grid = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
    print 'Searching for optimal C in {} using {}-fold validation on test set '.format(
        grid, nfolds)
    tuned_parameters = [{'C': grid}]
    clf = GridSearchCV(lr, tuned_parameters, cv=nfolds, scoring='accuracy')
    clf.fit(data_train, target_train)
    for params, mean_score, _ in clf.grid_scores_:
        print "{}: Mean accuracy {}".format(params, mean_score)

    print """Cross-validating above/below average mpg prediction
    using {}-fold validation on the test dataset.
    Using the best estimator: {}
    """.format(nfolds, clf.best_estimator_)

    mean_cross = np.mean(
        cross_val_score(clf.best_estimator_, data_test, target_test,
                        cv=nfolds))
    print "Mean cross-validated accuracy after optimization is: {}".format(
        mean_cross)
def intentpreprocess(logit, label):
    logit = transform_to_onehot(logit)  # 1 x intent vector
    # Sometimes no attribute applies, so a 'none' option was added for the
    # model to decide whether to pick a valid attribute at all. A 'none'
    # choice must not be counted as a label: if both logit and label are
    # 'none', the prediction is good, but if the label is 'none' and the
    # logit is not, it should count as a false positive.
    if logit[-1][Data.intentdict[1]['none']] > 0.5 and \
            label[-1][Data.intentdict[1]['none']] > 0.5:
        logit[-1][Data.intentdict[1]['none']] = int(0)
        label[-1][Data.intentdict[1]['none']] = int(0)
    else:
        label[-1][Data.intentdict[1]['none']] = 0
    binarizer = Binarizer(threshold=0.2)
    logit = binarizer.fit_transform(logit)
    label = binarizer.fit_transform(label)
    return logit[-1], label[-1]
def main():
    datasets = gen_datasets()
    print "origin data:"
    print datasets

    # Zero mean, unit variance
    standard_scaler = StandardScaler()
    scaler_datasets = standard_scaler.fit_transform(datasets)
    print scaler_datasets
    print "-" * 80

    min_max_scaler = MinMaxScaler()
    scaler_datasets = min_max_scaler.fit_transform(datasets)
    print scaler_datasets
    print "-" * 80

    max_abs_scaler = MaxAbsScaler()
    scaler_datasets = max_abs_scaler.fit_transform(datasets)
    print scaler_datasets
    print "-" * 80

    normalize = Normalizer(norm="l1")
    normalize_datasets = normalize.fit_transform(datasets)
    print normalize_datasets
    print "-" * 80

    binarizer = Binarizer(threshold=1.1)
    binarizer_datasets = binarizer.fit_transform(datasets)
    print binarizer_datasets
    print "-" * 80

    one_hot_encoder = OneHotEncoder()
    one_hot_encoder_datasets = one_hot_encoder.fit_transform([[0, 1, 4],
                                                              [1, 2, 0],
                                                              [2, 3, 5]])
    print one_hot_encoder_datasets.toarray()
    print "-" * 80

    imputer = Imputer(missing_values=0, strategy="median")
    imputer_datasets = imputer.fit_transform(datasets)
    print imputer_datasets
    print imputer.statistics_
def us_crime():
    # US crime dataset
    filename = '../../data/raw/mldata/communities.data'

    # The missing data will be considered as NaN
    # Only use the 122 continuous features
    tmp_data = np.genfromtxt(filename, delimiter=',')
    tmp_data = tmp_data[:, 5:]

    # Replace missing values by the mean
    imp = Imputer(verbose=1)
    tmp_data = imp.fit_transform(tmp_data)

    # Extract the data to be saved
    data = tmp_data[:, :-1]
    bn = Binarizer(threshold=0.65)
    label = np.ravel(bn.fit_transform(tmp_data[:, -1].reshape(-1, 1)))

    np.savez('../../data/clean/uci-us-crime.npz', data=data, label=label)
def test_logistic_regression_cv_deserializer(self):
    logistic_regression = LogisticRegressionCV(fit_intercept=True)
    logistic_regression.mlinit(input_features='a',
                               prediction_column='e_binary')

    extract_features = ['e']
    feature_extractor = FeatureExtractor(
        input_scalars=['e'],
        output_vector='extracted_e_output',
        output_vector_items=["{}_out".format(x) for x in extract_features])

    binarizer = Binarizer(threshold=0.0)
    binarizer.mlinit(prior_tf=feature_extractor,
                     output_features='e_binary')

    Xres = binarizer.fit_transform(self.df[['a']])

    logistic_regression.fit(self.df[['a']], Xres)
    logistic_regression.serialize_to_bundle(self.tmp_dir,
                                            logistic_regression.name)

    # Test model.json
    with open("{}/{}.node/model.json".format(
            self.tmp_dir, logistic_regression.name)) as json_data:
        model = json.load(json_data)

    # Now deserialize it back
    node_name = "{}.node".format(logistic_regression.name)
    logistic_regression_tf = LogisticRegressionCV()
    logistic_regression_tf = logistic_regression_tf.deserialize_from_bundle(
        self.tmp_dir, node_name)

    res_a = logistic_regression.predict(self.df[['a']])
    res_b = logistic_regression_tf.predict(self.df[['a']])

    self.assertEqual(res_a[0], res_b[0])
    self.assertEqual(res_a[1], res_b[1])
    self.assertEqual(res_a[2], res_b[2])
def main():
    raw_datasets, _ = Datasets.load_datasets()
    X, Y = gen_datasets(raw_datasets)
    vectorizer = CountVectorizer(decode_error="ignore")
    cv_datasets = vectorizer.fit_transform(X).toarray()

    clf = ExtraTreesClassifier()
    clf = clf.fit(cv_datasets, Y)
    print cv_datasets.shape
    print clf.feature_importances_
    model = SelectFromModel(clf, prefit=True)
    X_new = model.transform(cv_datasets)
    print X_new.shape

    binarizer = Binarizer(threshold=1.0)
    b_datasets = binarizer.fit_transform(cv_datasets)
    variance_threshold = VarianceThreshold(.8 * (1 - .8))
    v_datasets = variance_threshold.fit_transform(b_datasets)
    print v_datasets.shape
def test_logistic_regression_cv_serializer(self):
    logistic_regression = LogisticRegressionCV(fit_intercept=True)
    logistic_regression.mlinit(input_features=['a', 'b', 'c'],
                               prediction_column=['e_binary'])

    binarizer = Binarizer(threshold=0.0)
    binarizer.mlinit(input_features=['e'],
                     output_features=['e_binary'])

    Xres = binarizer.fit_transform(self.df[['a']])

    logistic_regression.fit(self.df[logistic_regression.input_features], Xres)
    logistic_regression.serialize_to_bundle(self.tmp_dir,
                                            logistic_regression.name)

    # Test model.json
    with open("{}/{}.node/model.json".format(
            self.tmp_dir, logistic_regression.name)) as json_data:
        model = json.load(json_data)

    self.assertEqual(model['op'], 'logistic_regression')
    self.assertTrue(model['attributes']['intercept']['value'] is not None)
def test_logistic_regression_cv_deserializer(self):
    logistic_regression = LogisticRegressionCV(fit_intercept=True)
    logistic_regression.mlinit(input_features=['a', 'b', 'c'],
                               prediction_column=['e_binary'])

    binarizer = Binarizer(threshold=0.0)
    binarizer.mlinit(input_features=['e'],
                     output_features=['e_binary'])

    Xres = binarizer.fit_transform(self.df[['a']])

    logistic_regression.fit(self.df[logistic_regression.input_features], Xres)
    logistic_regression.serialize_to_bundle(self.tmp_dir,
                                            logistic_regression.name)

    # Test model.json
    with open("{}/{}.node/model.json".format(
            self.tmp_dir, logistic_regression.name)) as json_data:
        model = json.load(json_data)

    # Now deserialize it back
    node_name = "{}.node".format(logistic_regression.name)
    logistic_regression_tf = LogisticRegressionCV()
    logistic_regression_tf = logistic_regression_tf.deserialize_from_bundle(
        self.tmp_dir, node_name)

    res_a = logistic_regression.predict(
        self.df[logistic_regression.input_features])
    res_b = logistic_regression_tf.predict(
        self.df[logistic_regression_tf.input_features])

    self.assertEqual(res_a[0], res_b[0])
    self.assertEqual(res_a[1], res_b[1])
    self.assertEqual(res_a[2], res_b[2])
def sparsityMeasure(loadPath, prefix):
    X, y = static_load_csr(loadPath)
    X_pos = X[y == 1, :]
    X_neg = X[y == 0, :]

    mean_traffic_pos = np.sum(np.sum(X_pos, axis=1)) * 1.0 / X_pos.shape[0]
    mean_traffic_neg = np.sum(np.sum(X_neg, axis=1)) * 1.0 / X_neg.shape[0]

    binarizer = Binarizer()
    X_pos = binarizer.fit_transform(X_pos)
    X_neg = binarizer.fit_transform(X_neg)

    mean_domains_pos = np.sum(np.sum(X_pos, axis=1)) * 1.0 / X_pos.shape[0]
    mean_domains_neg = np.sum(np.sum(X_neg, axis=1)) * 1.0 / X_neg.shape[0]

    print 'mean_traffic_pos : ' + str(mean_traffic_pos)
    print 'mean_traffic_neg : ' + str(mean_traffic_neg)
    print 'mean_domains_pos : ' + str(mean_domains_pos)
    print 'mean_domains_neg : ' + str(mean_domains_neg)

    overall_traffic = (mean_traffic_pos * X_pos.shape[0] +
                       mean_traffic_neg * X_neg.shape[0]) * 1.0 / X.shape[0]
    overall_domains = (mean_domains_pos * X_pos.shape[0] +
                       mean_domains_neg * X_neg.shape[0]) * 1.0 / X.shape[0]
    print 'overall_traffic : ' + str(overall_traffic)
    print 'overall_domains : ' + str(overall_domains)
def data_process(data, process_type):
    # Feature processing
    if process_type == "Binary":
        # Binarization: values greater than threshold map to 1,
        # values less than or equal to threshold map to 0
        processmodule = Binarizer(copy=True, threshold=0.0)
    elif process_type == "MinMax":
        # Min-max scaling
        processmodule = MinMaxScaler(feature_range=(0, 1), copy=True)
    elif process_type == "Stand":
        # Standardization
        processmodule = StandardScaler(copy=True, with_mean=True,
                                       with_std=True)
    elif process_type == "Normal":
        processmodule = Normalizer(copy=True, norm="l2")
    elif process_type == "MultiLabelBinar":
        # Multi-label binarization
        processmodule = MultiLabelBinarizer(sparse_output=True)
    else:
        raise ValueError("please select a correct process_type")
    result = processmodule.fit_transform(data)
    return result
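# Example call through the "Binary" branch (made-up input): with the fixed
# threshold of 0.0, positive entries become 1 and the rest become 0.
import numpy as np
print(data_process(np.array([[1.5, -0.3], [0.0, 2.2]]), "Binary"))
# -> [[1. 0.]
#     [0. 1.]]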
# Load libraries
from sklearn.preprocessing import Binarizer
import numpy as np

# Create feature
age = np.array([[6], [12], [20], [36], [65]])

# Create binarizer
binarizer = Binarizer(threshold=18)

# Transform feature
bn = binarizer.fit_transform(age)
print(bn)
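# Expected output: ages at or below the threshold of 18 map to 0,
# older ages map to 1:
# [[0]
#  [0]
#  [1]
#  [1]
#  [1]]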
d_tokens = file[10]
d_values = file[11]

from sklearn.feature_extraction.text import CountVectorizer
vectorizerToken = CountVectorizer(input='filename')
vectors = vectorizerToken.fit_transform(d_tokens)
print(len(vectors.toarray()))
res_1 = vectors.toarray()

from sklearn.preprocessing import Binarizer
onehot = Binarizer()
corpus = onehot.fit_transform(vectors.toarray())
res_2 = corpus

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(input='filename')
vec_tf_idf = tfidf.fit_transform(d_tokens)
res_3 = vec_tf_idf.toarray()

feature_names = vectorizerToken.get_feature_names()
res_1_names = ["count_" + feature for feature in feature_names]
res_2_names = ["bin_" + feature for feature in feature_names]
res_3_names = ["tfidf_" + feature for feature in feature_names]
# Transform -1 into 0 and take spin up as the standard configuration
binarizer = Binarizer(threshold=0)
keys = list(datah5.keys())

# Put here the temperatures from keys that you want to use for the training
# class_names = [keys[i] for i in [4, 6, 7, 8, 9, 10, 11, 12, 16]]
class_names = [keys[i] for i in [4, 10, 16]]
n_samples = datah5[keys[0]].shape[0]

datah5_norm = {}
data_bin = {}
for key in keys:
    datah5_norm[key] = np.array([
        np.where(np.sum(slice) < 0, -slice, slice) for slice in datah5[key]
    ])
    data_bin[key] = np.array(
        [binarizer.fit_transform(slice) for slice in datah5_norm[key]])

# Class labels, even if they are not really useful here
class_labels = np.asarray(
    list(
        itertools.chain.from_iterable(
            itertools.repeat(x, n_samples)
            for x in range(0, len(class_names)))))
one_hot_labels = np.zeros((len(class_labels), len(class_names)))
one_hot_labels[np.arange(len(class_labels)), class_labels] = 1

data = data_bin[class_names[0]]
for temperature in class_names[1:]:
    data = np.concatenate([data, data_bin[temperature]])

radii = [0, 1, 2, 3, 4, 5, 6, 8, 10, 12, 14, 16]
# Comment the section below out if you already have made pickle files
# ---------------------------------------------------------------------------
all_bigr = ngram(X_train, 'bigram')  # starting with all features

print "Starting counting bigrams..."
X_train_bi_counted = count(X_train, all_bigr, 'bigram')
print "Done counting train set"
X_test_bi_counted = count(X_test, all_bigr, 'bigram')
print "Done counting test set"

print "Binarizing and dumping files"
binarizer = Binarizer()
X_train_bi_binary = binarizer.fit_transform(X_train_bi_counted)
X_test_bi_binary = binarizer.transform(X_test_bi_counted)
pickle.dump(X_train_bi_binary, open("X_train_bi_binary.p", "wb"))
pickle.dump(X_test_bi_binary, open("X_test_bi_binary.p", "wb"))
print "Done"

print "Starting tfidf vectors..."
X_train_bi_tfidf, X_test_bi_tfidf = tfidf(X_train_bi_counted,
                                          X_test_bi_counted)
pickle.dump(X_train_bi_tfidf, open("X_train_bi_tfidf.p", "wb"))
pickle.dump(X_test_bi_tfidf, open("X_test_bi_tfidf.p", "wb"))
print "Done"

print "Starting feature selection using CART random forests on binary files"
indices_important_feats_bi_bin = tree(X_train_bi_binary, y_train, all_bigr,
                                      'Bigram_binary')
def binarize(img, threshold):
    binarizer = Binarizer(threshold=threshold, copy=False)
    return binarizer.fit_transform(img)
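# Usage sketch on a synthetic grayscale "image" (the 0.5 cutoff is an
# assumption): pixels above the threshold become 1, the rest 0. Note that
# copy=False means the input array itself may be overwritten.
import numpy as np
img = np.array([[0.1, 0.6], [0.7, 0.4]])
print(binarize(img, threshold=0.5))
# -> [[0. 1.]
#     [1. 0.]]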
def fp_vectorizer(self, processed_data):
    binarizer = Binarizer(threshold=5)
    vectorized_data = binarizer.fit_transform(processed_data)
    return vectorized_data
def load(opt='custom', x_filename=None, y_filename=None, n_samples=0,
         samples_on='rows', **kwargs):
    """Load a specified dataset.

    This function can be used either to load one of the standard
    scikit-learn datasets or a different dataset saved as X.npy Y.npy
    in the working directory.

    Parameters
    -----------
    opt : {'iris', 'digits', 'diabetes', 'boston', 'circles', 'moons',
           'custom', 'GSEXXXXX'}, default: 'custom'
        Name of a predefined dataset to be loaded. 'iris', 'digits',
        'diabetes', 'boston', 'circles' and 'moons' refer to the
        corresponding `scikit-learn` datasets. 'custom' can be used to
        load a custom dataset whose name is specified in `x_filename`
        and `y_filename` (optional).

    x_filename : string, default : None
        The data matrix file name.

    y_filename : string, default : None
        The label vector file name.

    n_samples : int
        The number of samples to be loaded. This comes in handy when
        dealing with large datasets. When n_samples is less than the
        actual size of the dataset this function performs a random
        subsampling that is stratified w.r.t. the labels (if provided).

    samples_on : string
        This can be either in ['row', 'rows'] if the samples lie on the
        rows of the input data matrix, or vice versa in ['col', 'cols']
        the other way around.

    data_sep : string
        The data separator. For instance comma, tab, blank space, etc.

    Returns
    -----------
    X : array of float, shape : n_samples x n_features
        The input data matrix.

    y : array of float, shape : n_samples
        The label vector; np.nan if missing.

    feature_names : array of integers (or strings), shape : n_features
        The feature names; a range of numbers if missing.

    index : list of integers (or strings)
        This is the samples identifier, if provided as first column
        (or row) of the input file. Otherwise it is just an incremental
        range of size n_samples.
""" data = None try: if opt.lower() == 'iris': data = datasets.load_iris() elif opt.lower() == 'digits': data = datasets.load_digits() elif opt.lower() == 'diabetes': data = datasets.load_diabetes() b = Binarizer(threshold=np.mean(data.target)) data.target = b.fit_transform(data.data) elif opt.lower() == 'boston': data = datasets.load_boston() b = Binarizer(threshold=np.mean(data.target)) data.target = b.fit_transform(data.data) elif opt.lower() == 'gauss': means = np.array([[-1, 1, 1, 1], [0, -1, 0, 0], [1, 1, -1, -1]]) sigmas = np.array([0.33, 0.33, 0.33]) if n_samples <= 1: n_samples = 333 xx, yy = generate_gauss(mu=means, std=sigmas, n_sample=n_samples) data = datasets.base.Bunch(data=xx, target=yy) elif opt.lower() == 'circles': if n_samples == 0: n_samples = 400 xx, yy = datasets.make_circles(n_samples=n_samples, factor=.3, noise=.05) data = datasets.base.Bunch(data=xx, target=yy) elif opt.lower() == 'moons': if n_samples == 0: n_samples = 400 xx, yy = datasets.make_moons(n_samples=n_samples, noise=.01) data = datasets.base.Bunch(data=xx, target=yy) elif opt.lower() == 'custom': data = load_custom(x_filename, y_filename, samples_on, **kwargs) elif opt.lower().startswith('gse'): raise Exception("Use ade_GEO2csv.py to convert GEO DataSets" "into csv files.") except IOError as e: print("I/O error({0}): {1}".format(e.errno, e.strerror)) X, y = data.data, data.target if n_samples > 0 and X.shape[0] > n_samples: if y is not None: try: # Legacy for sklearn sss = StratifiedShuffleSplit(y, test_size=n_samples, n_iter=1) # idx = np.random.permutation(X.shape[0])[:n_samples] except TypeError: sss = StratifiedShuffleSplit(test_size=n_samples) \ .split(X, y) _, idx = list(sss)[0] else: idx = np.arange(X.shape[0]) np.random.shuffle(idx) idx = idx[:n_samples] X, y = X[idx, :], y[idx] else: # The length of index must be consistent with the number of samples idx = np.arange(X.shape[0]) feat_names = data.feature_names if hasattr(data, 'feature_names') \ else np.arange(X.shape[1]) index = np.array(data.index)[idx] if hasattr(data, 'index') \ else np.arange(X.shape[0]) return X, y, feat_names, index