)
clf = neighbors.KNeighborsClassifier(n_neighbors=1)
clf.fit(X, y)

############################################################################
# You can also ask for meta-data to automatically preprocess the data.
#
# * e.g. categorical features -> do feature encoding
dataset = openml.datasets.get_dataset(17)
X, y, categorical_indicator, attribute_names = dataset.get_data(
    dataset_format='array',
    target=dataset.default_target_attribute
)
print("Categorical features: {}".format(categorical_indicator))
transformer = compose.ColumnTransformer(
    [('one_hot_encoder',
      preprocessing.OneHotEncoder(categories='auto'),
      categorical_indicator)])
X = transformer.fit_transform(X)
clf.fit(X, y)

############################################################################
# Runs: Easily explore models
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^
# We can run (many) scikit-learn algorithms on (many) OpenML tasks.

# Get a task
task = openml.tasks.get_task(403)

# Build any classifier or pipeline
clf = tree.ExtraTreeClassifier()

# Run the flow
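# The snippet above stops just before executing the run; a minimal sketch of
# the usual next step with the documented openml-python API (not part of the
# original snippet; publishing the run would additionally need an API key):
run = openml.runs.run_model_on_task(clf, task)
print(run)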
X[pos:pos + len(temptrain), ] = temptrain
y[pos:pos + len(temptrain), ] = tempytrain
pos += len(temptrain)
X[pos:pos + len(tempvalid), ] = tempvalid
y[pos:pos + len(tempvalid), ] = tempyvalid
pos += len(tempvalid)
X[pos:pos + len(temptest), ] = temptest
y[pos:pos + len(temptest), ] = tempytest
pos += len(temptest)

select = (y[:, 0] == 1) | (y[:, 4] == 1)
X = X[select, :]
y = y[select, :]
y = np.argmax(y, axis=1)
y[y == 4] = 1
encoder = preprocessing.OneHotEncoder(n_values=2)
y = encoder.fit_transform(np.reshape(y, (len(y), 1))).toarray()

with h5py.File(hdf5_file, 'w') as f:
    X_train = f.create_dataset("X_train", (1000, width), compression="gzip")
    X_valid = f.create_dataset("X_valid", (100, width), compression="gzip")
    X_test = f.create_dataset("X_test", (100, width), compression="gzip")
    y_train = f.create_dataset("y_train", (1000, 2), compression="gzip")
    y_valid = f.create_dataset("y_valid", (100, 2), compression="gzip")
    y_test = f.create_dataset("y_test", (100, 2), compression="gzip")
    X_train[:, ] = X[:1000, :]
    X_valid[:, ] = X[1000:1100, :]
    X_test[:, ] = X[1100:1200, :]
    y_train[:, ] = y[:1000, :]
    y_valid[:, ] = y[1000:1100, :]
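# Note: the n_values argument used above was removed in scikit-learn 0.22.
# With a modern version, the two encoding lines would instead read (a sketch,
# replacing rather than following them):
encoder = preprocessing.OneHotEncoder(categories=[[0, 1]])
y = encoder.fit_transform(np.reshape(y, (len(y), 1))).toarray()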
def load(self):
    """
    Load this dataset into an undirected heterogeneous graph, downloading
    it if required.

    The graph has two types of nodes (``user`` and ``movie``) and one type
    of edge (``rating``).

    The dataset includes some node features on both users and movies: on
    users, they consist of categorical features (``gender`` and ``job``)
    which are one-hot encoded into binary features, and an ``age`` feature
    that is scaled to have mean = 0 and standard deviation = 1.

    Returns:
        A tuple where the first element is a :class:`StellarGraph` instance
        containing the graph data and features, and the second element is a
        pandas DataFrame of edges, with columns ``user_id``, ``movie_id``
        and ``rating`` (a label from 1 to 5).
    """
    self.download()
    ratings, users, movies, *_ = [
        self._resolve_path(path) for path in self.expected_files
    ]
    edges = pd.read_csv(
        ratings,
        sep="\t",
        header=None,
        names=["user_id", "movie_id", "rating", "timestamp"],
        usecols=["user_id", "movie_id", "rating"],
    )
    users = pd.read_csv(
        users,
        sep="|",
        header=None,
        names=["user_id", "age", "gender", "job", "zipcode"],
        usecols=["user_id", "age", "gender", "job"],
    )
    movie_columns = [
        "movie_id",
        "title",
        "release_date",
        "video_release_date",
        "imdb_url",
        # features from here:
        "unknown", "action", "adventure", "animation", "childrens",
        "comedy", "crime", "documentary", "drama", "fantasy", "film_noir",
        "horror", "musical", "mystery", "romance", "sci_fi", "thriller",
        "war", "western",
    ]
    movies = pd.read_csv(
        movies,
        sep="|",
        header=None,
        names=movie_columns,
        usecols=["movie_id"] + movie_columns[5:],
    )

    # manage the IDs
    def u(users):
        return "u_" + users.astype(str)

    def m(movies):
        return "m_" + movies.astype(str)

    users_ids = u(users["user_id"])

    movies["movie_id"] = m(movies["movie_id"])
    movies.set_index("movie_id", inplace=True)

    edges["user_id"] = u(edges["user_id"])
    edges["movie_id"] = m(edges["movie_id"])

    # convert categorical user features to numeric, and normalize age
    feature_encoding = preprocessing.OneHotEncoder(sparse=False)
    onehot = feature_encoding.fit_transform(users[["gender", "job"]])
    scaled_age = preprocessing.scale(users["age"])
    encoded_users = pd.DataFrame(
        onehot, index=users_ids).assign(scaled_age=scaled_age)

    g = StellarGraph(
        {"user": encoded_users, "movie": movies},
        {"rating": edges[["user_id", "movie_id"]]},
        source_column="user_id",
        target_column="movie_id",
    )
    return g, edges
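# Hypothetical usage sketch, assuming this load() is the MovieLens loader
# from stellargraph.datasets (which matches the docstring above):
from stellargraph import datasets

g, edges = datasets.MovieLens().load()
print(g.info())
print(edges.head())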
def main():
    """
    Fit models and make predictions.
    We'll use one-hot encoding to transform our categorical features
    into binary features.
    y and X will be numpy array objects.
    """
    filename = "main_logit_3way"  # name prefix for output files
    model = LogisticRegression(C=0.7, penalty="l2")  # the classifier we'll use

    # === load data in memory === #
    print "loading data"
    y, X = load_data('train.csv')
    y_test, X_test = load_data('test.csv', use_labels=False)
    X, X_test = Make_3way(X, X_test)  # add interactions

    # === one-hot encoding === #
    # we want to encode the category IDs encountered both in
    # the training and the test set, so we fit the encoder on both
    encoder = preprocessing.OneHotEncoder()
    encoder.fit(np.vstack((X, X_test)))
    X = encoder.transform(X)  # returns a sparse matrix (see scipy.sparse)
    X_test = encoder.transform(X_test)

    # if you want to create new features, you'll need to compute them
    # before the encoding, and append them to your dataset after

    # create arrays to hold cv and test predictions
    train_stacker = [0.0 for k in range(0, X.shape[0])]

    # === training & metrics === #
    mean_auc = 0.0
    bagging = 1  # number of models trained with different seeds
    n = 5  # number of folds in stratified cv
    kfolder = StratifiedKFold(y, n_folds=n, shuffle=True, random_state=SEED)
    i = 0
    for train_index, test_index in kfolder:
        # for each train/test pair of indices in the kfolder object,
        # build the training and validation sets
        X_train, X_cv = X[train_index], X[test_index]
        y_train, y_cv = np.array(y)[train_index], np.array(y)[test_index]
        #print (" train size: %d. test size: %d, cols: %d " % ((X_train.shape[0]) ,(X_cv.shape[0]) ,(X_train.shape[1]) ))

        # if you want to perform feature selection / hyperparameter
        # optimization, this is where you want to do it

        # train model and make predictions
        preds = bagged_set(X_train, y_train, model, SEED, bagging, X_cv,
                           update_seed=True)

        # compute AUC metric for this CV fold
        roc_auc = roc_auc_score(y_cv, preds)
        print "AUC (fold %d/%d): %f" % (i + 1, n, roc_auc)
        mean_auc += roc_auc

        no = 0
        for real_index in test_index:
            train_stacker[real_index] = preds[no]
            no += 1
        i += 1

    mean_auc /= n
    print (" Average AUC: %f" % (mean_auc) )
    print (" printing train datasets ")
    printfilcsve(np.array(train_stacker), filename + ".train.csv")

    # === Predictions === #
    # When making predictions, retrain the model on the whole training set
    preds = bagged_set(X, y, model, SEED, bagging, X_test, update_seed=True)

    # create submission file
    printfilcsve(np.array(preds), filename + ".test.csv")
from sklearn.externals import joblib

# Importing dataset
df = pd.read_csv('Churn_Modelling.csv')
X = df.iloc[:, 3:13].values
y = df.iloc[:, 13].values

# Encoding categorical data
# Encoding gender:
le_gender = preprocessing.LabelEncoder()
X[:, 2] = le_gender.fit_transform(X[:, 2])

# Encoding country: use one-hot encoding to avoid nonsensical averages
le_country = preprocessing.LabelEncoder()
X[:, 1] = le_country.fit_transform(X[:, 1])
ohe_country = preprocessing.OneHotEncoder(categorical_features=[1])
X = ohe_country.fit_transform(X).toarray()
X = X[:, 1:]  # drop one dummy column to avoid the dummy-variable trap

# Splitting dataset into train and test
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=0)

# Feature scaling
sc = preprocessing.StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

scaler_filename = input('*Enter filename for scaler to be saved: ') + '.bin'
joblib.dump(sc, scaler_filename)  # joblib accepts a filename directly

# Training the ANN
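# Note: categorical_features was removed in scikit-learn 0.22. A minimal
# sketch of the same preprocessing with the modern ColumnTransformer API
# (column indices as in the snippet above; drop='first' plays the role of
# X = X[:, 1:] and also covers the binary gender column):
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(
    [('cats', OneHotEncoder(drop='first'), [1, 2])],  # country, gender
    remainder='passthrough')
X = ct.fit_transform(df.iloc[:, 3:13].values)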
def __init__(self, survey='OGLE3', band='I', use_time=True, use_err=True,
             norm=True, folded=True, machine='Jorges-MBP', seq_len=600,
             phy_params='', subsample=False):
    """
    Parameters
    ----------
    survey : str
        Name of survey to be used (only OGLE3 available for now)
    band : str
        Name of passband for a given survey name
        (OGLE3 uses I-band light curves for now)
    use_time : bool, optional
        return light curves with time or not
    use_err : bool, optional
        return light curves with error measurements or not
    norm : bool, optional
        normalize light curves or not
    folded : bool, optional
        use folded light curves or not
    machine : str, optional
        which machine is being used (colab, exalearn, local)
    seq_len : int, optional
        length of the light curves to be used
    phy_params : str, optional
        which physical parameters will be provided with the loader
    subsample : bool, optional
        whether to subsample the entire dataset
    """
    if machine == 'Jorges-MBP':
        root = local_root
    elif machine == 'colab':
        root = colab_root
    elif machine == 'exalearn':
        root = exalearn_root
    else:
        print('Wrong machine, please select local, colab or exalearn')
        sys.exit()

    if not folded:
        data_path = ('%s/time_series/real' % (root) +
                     '/%s_lcs_%s_meta_snr5_augmented_trim%i.pkl' %
                     (survey, band, seq_len))
    else:
        data_path = ('%s/time_series/real' % (root) +
                     '/%s_lcs_%s_meta_snr5_augmented_folded_trim%i.npy.gz' %
                     (survey, band, seq_len))
    print('Loading from:\n', data_path)

    with gzip.open(data_path, 'rb') as f:
        self.aux = np.load(f, allow_pickle=True)
    self.lcs = self.aux.item()['lcs']
    self.meta = self.aux.item()['meta']
    del self.aux

    if subsample:
        idx = np.random.randint(0, self.lcs.shape[0], 20000)
        self.lcs = self.lcs[idx]
        self.meta = self.meta.iloc[idx].reset_index(drop=True)

    self.labels = self.meta['Type'].values

    ## integer encoding of labels
    self.label_int_enc = preprocessing.LabelEncoder()
    self.label_int_enc.fit(self.labels)
    self.labels_int = self.label_int_enc.transform(self.labels)

    ## one-hot encoding of labels
    self.label_onehot_enc = preprocessing.OneHotEncoder(sparse=False,
                                                        categories='auto',
                                                        dtype=np.float32)
    self.label_onehot_enc.fit(self.labels.reshape(-1, 1))
    self.labels_onehot = self.label_onehot_enc.transform(
        self.labels.reshape(-1, 1))

    if use_time and not use_err:
        self.lcs = self.lcs[:, :, 0:2]
    if not use_time and not use_err:
        self.lcs = self.lcs[:, :, 1:2]
    if 'folded' not in data_path:
        self.lcs = return_dt(self.lcs)
    if norm:
        self.lcs = normalize_each(self.lcs,
                                  n_feat=self.lcs.shape[2],
                                  scale_to=[.0001, .9999],
                                  norm_time=use_time)

    self.phy_names = []
    if len(phy_params) > 0:
        if 'p' in phy_params or 'P' in phy_params:
            self.phy_names.append('Period')
        if 't' in phy_params or 'T' in phy_params:
            self.phy_names.append('teff_val')
        if 'm' in phy_params or 'M' in phy_params:
            self.phy_names.append('[Fe/H]_J95')
        if 'c' in phy_params or 'C' in phy_params:
            self.phy_names.append('bp_rp')
        if 'a' in phy_params or 'A' in phy_params:
            self.phy_names.append('abs_Gmag')
        if 'r' in phy_params or 'R' in phy_params:
            self.phy_names.append('radius_val')
        if 'l' in phy_params or 'L' in phy_params:
            self.phy_names.append('lum_val')
        self.phy_aux = self.phy_names
    else:
        self.phy_aux = ['Period']

    self.mm_scaler = preprocessing.MinMaxScaler()
    self.mm_scaler.fit(self.meta.loc[:, self.phy_aux].values.astype(
        np.float32))
    self.meta_p = self.mm_scaler.transform(
        self.meta.loc[:, self.phy_aux].values.astype(np.float32))
def pre_processing_train(train_data, test_data):
    X_train = train_data.loc[:, train_data.columns != 'SalePrice']
    X_test = test_data.loc[:, test_data.columns != 'SalePrice']

    # DataFrame.append is deprecated in recent pandas; concat is equivalent
    X_combined = pd.concat([X_train, X_test], ignore_index=True)
    X_combined.shape

    def nulls(X):
        null_train = X.isnull().sum()
        null_train = null_train[null_train > 0]
        return null_train

    null_combined = nulls(X_combined)

    def dropColumns(X, nulls):
        for i in np.arange(len(nulls)):
            if nulls.values[i] > .5 * len(X):
                X = X.drop([nulls.index[i]], axis=1, inplace=False)
        return X

    X_combined = dropColumns(X_combined, null_combined)
    null_combined = nulls(X_combined)

    def impute(X, nulls):
        for i in nulls.index:
            # print(str(data[i].dtype.name) + " " + str(i))
            # impute_mode = Imputer(missing_values = 'NaN', strategy = 'most_frequent', axis = 0)
            # impute_mean = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 0)
            if X[i].nunique() < 50:
                X[i] = X[i].fillna(X[i].mode()[0])
            else:
                X[i] = X[i].fillna(X[i].mean())
        return X

    X_combined = impute(X_combined, null_combined)
    X_combined.isnull().sum()

    def get_objectIndices(X):
        objectIndices = []
        for column in X:
            if X[column].nunique() < 50:
                objectIndices.append(X.columns.get_loc(column))
        return objectIndices

    def get_numericIndices(X):
        numericIndices = []
        for column in X:
            if X[column].nunique() >= 50:
                numericIndices.append(X.columns.get_loc(column))
        return numericIndices

    def get_numericColumnName(X):
        numericColumnName = []
        for column in X:
            if X[column].nunique() >= 50:
                numericColumnName.append(column)
        return numericColumnName

    # def remove_num_corr(X):
    #     numericColumnName = get_numericColumnName(X)
    #     numeric_combined = X.loc[:, numericColumnName]
    #     numeric_combined_corr = numeric_combined.corr()
    #     for i in numeric_combined_corr:
    #         numericCorrCount = numeric_combined_corr[i].where(lambda x: abs(x) >= .25).count()
    #         if numericCorrCount > 5:
    #             X = X.drop(i, axis=1, inplace=False)
    #     return X
    # X_combined = remove_num_corr(X_combined)

    numericColumnName = get_numericColumnName(X_combined)
    scaler = preprocessing.StandardScaler()
    scaler.fit(X_combined[numericColumnName])
    X_combined[numericColumnName] = scaler.transform(
        X_combined[numericColumnName])

    # scaler = preprocessing.MinMaxScaler()
    # scaler.fit(X_combined[numericColumnName])
    # X_combined[numericColumnName] = scaler.transform(X_combined[numericColumnName])

    # scaler = preprocessing.RobustScaler()
    # scaler.fit(X_combined[numericColumnName])
    # X_combined[numericColumnName] = scaler.transform(X_combined[numericColumnName])

    # pca = PCA(.9)
    # pca.fit(X_combined[numericColumnName])
    # a = pca.transform(X_combined[numericColumnName])
    # a.shape
    # X_combined = X_combined.drop(numericColumnName, axis=1, inplace=False)

    objectIndices = get_objectIndices(X_combined)
    le = preprocessing.LabelEncoder()
    X_combined = X_combined.apply(le.fit_transform)
    # note: categorical_features requires scikit-learn < 0.22
    onehotencoder = preprocessing.OneHotEncoder(
        categorical_features=objectIndices)
    X_combined = onehotencoder.fit_transform(X_combined).toarray()
    # X_combined = np.concatenate((X_combined, a), axis=1)
    return X_combined
def one_hot_transform(formalarray, input_label):
    # convert the original integer encoding into a one-hot encoding
    enc = pre.OneHotEncoder()
    enc.fit(formalarray)
    return enc.transform(input_label).toarray()
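# Hypothetical usage sketch, assuming `pre` is sklearn.preprocessing:
import numpy as np
from sklearn import preprocessing as pre

table = np.array([[0, 1], [1, 0], [2, 1]])  # fit on the full code table
row = np.array([[1, 1]])                    # encode a single row
print(one_hot_transform(table, row))        # -> [[0. 1. 0. 0. 1.]]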
freq_of_common_class[cls] = freq_of_common_class.get(cls, 0) + 1
###print("\nThe Frequency of occurrence of the common classes in the testing data set", freq_of_each_class)

# Pick 3 Classes with the most number of images from the common Classes
counts = nlargest(3, freq_of_common_class.values())
classes_to_be_considered = [
    key for key, value in freq_of_common_class.items() if value in counts
]
print("The 3 Common Classes have a total number of images: ", sum(counts))
print("The 3 Common Classes that have the highest number of images are: ",
      classes_to_be_considered)
###print("\nClasses that will be considered: ", classes_to_be_considered)

# Transform labels from a list of strings to a list of Numbers
numerical_form_classes = np.asarray(
    [classes_to_be_considered.index(t) for t in classes_to_be_considered])
###print("\nClasses that will be considered in Numerical form: ", numerical_form_classes)

classes_onehot = preprocessing.OneHotEncoder(sparse=False).fit_transform(
    numerical_form_classes.reshape(-1, 1))
###print("One hot vector of Classes that will be considered is", classes_onehot)

# Dictionary with Class name and Corresponding label
corres_label = dict(zip(classes_to_be_considered, classes_onehot))
###print("\nDictionary with Class name and Corresponding label", corres_label)

# The total number of images that will be considered is:

# Build the model's Training Data set
training_data = []
readable_training_data = []
for folder in training_classes:
    if folder in classes_to_be_considered:
        path = TRAIN_DIR + folder
        files = os.listdir(path)
        # print(files)
X_label = []
X_label1 = []
for i in range(0, len(my_data[0])):
    if i in unique_list:
        for j in range(0, len(unique_list[i])):
            X_label.append(unique_list[i][j] + str(i))
    else:
        X_label1.append(my_data[0][i] + str(i))

i = 0
print X_label1
for i in range(0, len(X_label1)):
    X_label.append(X_label1[i])

X = [X_label]
print len(X)
enc = preprocessing.OneHotEncoder(categorical_features=categories)
enc.fit(encoder)
for row in my_data[1:]:
    X_small = enc.transform(encode(row)).toarray()
    X.append(X_small[0].tolist())

#myfile = open("processed_data_new_v2.csv",'wb')
#wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
#for row in X:
#    wr.writerow(row)

building_model_accuracy(X)
def run(fold):
    # load the full training data with folds
    df = pd.read_csv("../inputs/train-folds.csv")

    # list of all numerical columns
    num_cols = [
        "age", "fnlwgt", "capital-gain", "capital-loss", "hours-per-week"
    ]

    # drop the numerical columns for simplicity
    df = df.drop(num_cols, axis=1)

    # remove white-space from the values of the income column
    df["income"] = df.income.str.strip()

    # map targets to 0s and 1s
    target_mapping = {"<=50K": 0, ">50K": 1}
    df.loc[:, "income"] = df.income.map(target_mapping)

    # all the categorical features except income & kfold
    features = [x for x in df.columns if x not in ("kfold", "income")]

    # handle NaN values; converting all columns to strings
    # doesn't matter here because they are all categories
    for col in features:
        df.loc[:, col] = df[col].astype(str).fillna("NONE")

    # training dataset
    df_train = df[df.kfold != fold].reset_index(drop=True)

    # validation dataset
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    # initialize the OneHotEncoder from scikit-learn
    ohe = preprocessing.OneHotEncoder()

    # fit ohe on training + validation features
    full_data = pd.concat([df_train[features], df_valid[features]], axis=0)
    ohe.fit(full_data[features])

    # get training data using folds
    x_train = ohe.transform(df_train[features])

    # get validation data using folds
    x_valid = ohe.transform(df_valid[features])

    # initialize xgboost model
    model = xg.XGBClassifier(max_depth=7, n_estimators=200, n_jobs=-1)

    # fit model on training data
    model.fit(x_train, df_train.income.values)

    # predict the probability of the 1 class; we need probability
    # values because we are calculating AUC
    yhat_ones = model.predict_proba(x_valid)[:, 1]

    # get roc auc score
    auc = metrics.roc_auc_score(df_valid.income.values, yhat_ones)
    # roc_curve expects (y_true, y_score) in that order
    fpr, tpr, threshold = metrics.roc_curve(df_valid.income.values, yhat_ones)

    # print auc at each fold
    print(f"Fold = {fold}, AUC = {auc}")
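# A minimal driver sketch (assumes train-folds.csv was prepared with a
# "kfold" column holding five folds, as the function above expects):
if __name__ == "__main__":
    for fold_ in range(5):
        run(fold_)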
def _one_hot(self):
    self.ohe = preprocessing.OneHotEncoder()
    self.ohe.fit(self.df[self.cat_feats].values)
    return self.ohe.transform(self.df[self.cat_feats].values)
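# Aside: the method above fits on the whole dataframe. If the encoder is
# fit on training rows only, handle_unknown='ignore' keeps transform from
# failing on unseen categories; a self-contained sketch with made-up data:
import pandas as pd
from sklearn import preprocessing

df_train = pd.DataFrame({"ord_2": ["hot", "cold"], "ord_3": ["a", "b"]})
df_valid = pd.DataFrame({"ord_2": ["warm", "hot"], "ord_3": ["a", "c"]})

ohe = preprocessing.OneHotEncoder(handle_unknown='ignore')
ohe.fit(df_train.values)
# unseen 'warm' and 'c' encode as all-zero blocks instead of raising
print(ohe.transform(df_valid.values).toarray())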
dataset = dataset.sort_values(by=[' Start time'])
X = dataset.iloc[:, 5:].values
Y = (dataset[' Event Name'])

# deal with nan padding
X = np.nan_to_num(X)

# min-max scale X
mmScaler = preprocessing.MinMaxScaler()
X = mmScaler.fit_transform(X)
# X = X.reshape((-1, 8, 279))  # take every eight rows (8 channels) as a sample

# lb = preprocessing.LabelBinarizer()
# Y = lb.fit_transform(np.expand_dims(Y, axis=1))
encoder = preprocessing.OneHotEncoder(categories='auto')
Y = encoder.fit_transform(np.expand_dims(np.asarray(Y), axis=1)).toarray()
# Y = Y[0:-1:8]  # take every eighth row as a label

# separate training and test set
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                    test_size=0.1,
                                                    random_state=3)

# Using SMOTE oversampling ######################
# X_train = X_train.reshape((len(X_train), -1))  # reshape x so that it can be resampled
# X_train, Y_train = smote.fit_sample(X_train, Y_train)
# X_train = X_train.reshape((len(X_train), 8, -1))  # reshape x back into 8 channel format

# Using Class Weighting ##########################
# classWeight = compute_class_weight('balanced', np.unique(Y), Y)
i[2] = '1'

#transform test data
test_cat1 = le1.transform([i[0] for i in test])
test_cat2 = le2.transform([i[1] for i in test])
test_cat3 = le3.transform([i[2] for i in test])
test_cat4 = le4.transform([i[3] for i in test])
test_cat5 = le5.transform([i[4] for i in test])
test_cat6 = le6.transform([i[5] for i in test])
test_cat7 = le7.transform([i[6] for i in test])
test_cat8 = le8.transform([i[7] for i in test])
test_cat9 = le9.transform([i[8] for i in test])
test_cat = [[test_cat1[i], test_cat2[i], test_cat3[i], test_cat4[i],
             test_cat5[i], test_cat6[i], test_cat7[i], test_cat8[i],
             test_cat9[i]] for i in range(len(test_cat1))]

#create dummy vars
enc = preprocessing.OneHotEncoder(sparse=True)
enc.fit(X_cat)
X = enc.transform(X_cat)
test = enc.transform(test_cat)

#don't need stores with 0 sales
X = X[Y > 0]
Y = Y[Y > 0]

#log transform sales
Y = np.log(Y)

#Do some cross val testing
kf = KFold(np.shape(X)[0], n_folds=5)
i = 0
rmspe = []
test = test.dropna(axis=0)
test.shape

features = train['Sex']
enc = preprocessing.LabelEncoder()
enc.fit(features)
features = enc.transform(features)
ohe = preprocessing.OneHotEncoder()
# fit returns the encoder itself, so fit + transform in one idiomatic call
features = ohe.fit_transform(features.reshape(-1, 1)).toarray()

features1 = train['Embarked']
enc = preprocessing.LabelEncoder()
enc.fit(features1)
features1 = enc.transform(features1)
ohe = preprocessing.OneHotEncoder()
features1 = ohe.fit_transform(features1.reshape(-1, 1)).toarray()
if 'classification' in datasets[datasetname]['probtype']:
    with open(datasets[datasetname]['filepath'], 'rb') as fl:
        df = pk.load(fl)

    # check that there are no missing values
    assert (np.all(np.logical_not(df.isna()))), 'Nan values present'

    ycols = datasets[datasetname]['targets']
    xcolsnum = list(
        set(df.select_dtypes([np.number]).columns) - set(ycols))
    xcolsnonnum = list(
        set(df.select_dtypes([object]).columns) - set(ycols))

    if len(xcolsnonnum) > 0:
        # one-hot encoding of any categorical variables
        Xnonnum = df.loc[:, xcolsnonnum].values
        ohe = pp.OneHotEncoder(sparse=False, drop='first')
        ohe.fit(Xnonnum)
        XnonnumOhe = ohe.transform(Xnonnum)
        # check that the excluded variable is the first variable
        excluded = ohe.categories_[0][0]
        idx = XnonnumOhe.sum(
            axis=1) == 0  # find all rows that don't fit in another category
        assert np.all(Xnonnum[idx] == excluded)

    # concatenate to arrive at final arrays
    xcols = xcolsnum + xcolsnonnum
    X = df.loc[:, xcolsnum].values
    y = np.ravel(df.loc[:, ycols].values)
    if len(xcolsnonnum) > 0:
import numpy as np
from sklearn import preprocessing

# create a random 1-d array with 1000 different categories (int)
example = np.random.randint(1000, size=1000000)

# initialize OneHotEncoder from scikit-learn
# keep sparse = False to get a dense array
ohe = preprocessing.OneHotEncoder(sparse=False)

# fit and transform data with the dense one-hot encoder
ohe_example = ohe.fit_transform(example.reshape(-1, 1))

# print size in bytes of the dense array
print(f"Size of dense array: {ohe_example.nbytes}")

# initialize OneHotEncoder from scikit-learn
# keep sparse = True to get a sparse matrix
ohe = preprocessing.OneHotEncoder(sparse=True)

# fit and transform data with the sparse one-hot encoder
ohe_example = ohe.fit_transform(example.reshape(-1, 1))

# print size of the data held in this sparse matrix
print(f"Size of sparse array: {ohe_example.data.nbytes}")

full_size = (ohe_example.data.nbytes + ohe_example.indptr.nbytes +
             ohe_example.indices.nbytes)

# print full size of this sparse matrix
print(f"Full size of sparse array: {full_size}")
CalleEstado = np.array(dataframe.CalleEstado.values)
#print(CalleEstado)

subset = dataframe[["Dia", "TipoCalle", "Iluminacion", "Clima",
                    "CalleEstado"]]
#print(subset)

valores = np.array(subset.values)
#print(valores)

diasCate = [1, 2]
TipoCalleCate = [1, 2, 3, 4, 5]
IluminacionCate = [1, 2, 3, 4]
ClimaCate = [0, 1, 2, 3, 4]
CalleEstadoCate = [0, 1, 2, 3, 4]

enc = preprocessing.OneHotEncoder(categories=[
    diasCate, TipoCalleCate, IluminacionCate, ClimaCate, CalleEstadoCate
])
fit = enc.fit(valores)
arreglo = enc.transform(valores).toarray()

OHE = pd.DataFrame({
    'DiaFinde': arreglo[:, 0],
    'DiaLaboral': arreglo[:, 1],
    "Autopista": arreglo[:, 2],
    "AutopistaDoble": arreglo[:, 3],
    "1Via": arreglo[:, 4],
    "Redondel": arreglo[:, 5],
    "Entrada": arreglo[:, 6],
    "LuzDia": arreglo[:, 7],
    "LuzNoche": arreglo[:, 8],
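# Aside: instead of naming each dummy column by hand, a newer scikit-learn
# (>= 1.0) can generate positional names; a sketch reusing enc and arreglo
# from above (names come out as e.g. "Dia_1", not the semantic ones):
cols = enc.get_feature_names_out(
    ["Dia", "TipoCalle", "Iluminacion", "Clima", "CalleEstado"])
OHE_auto = pd.DataFrame(arreglo, columns=cols)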
#type cast features
features_to_cast = ['MSSubClass']
utils.cast_to_cat(house_train, features_to_cast)

#manual feature selection
features_to_drop = ['Id', 'SalePrice']
missing_features_above_th = utils.get_features_to_drop_on_missingdata(
    house_train, 0.25)
features_to_drop.extend(missing_features_above_th)
house_train1 = utils.drop_features(house_train, features_to_drop)
house_train1.info()

#build pipeline for categorical features
categorical_pipeline = pipeline.Pipeline([
    ('imputer', impute.SimpleImputer(strategy="most_frequent")),
    ('ohe', preprocessing.OneHotEncoder(sparse=False,
                                        handle_unknown='ignore'))
])

#build pipeline for numerical features
numerical_pipeline = pipeline.Pipeline([('imputer', impute.SimpleImputer()),
                                        ('scaler',
                                         preprocessing.StandardScaler())])

#build preprocessing pipeline for all features
cat_features = utils.get_non_continuous_features(house_train1)
num_features = utils.get_continuous_features(house_train1)
preprocess_pipeline = compose.ColumnTransformer([
    ('cat', categorical_pipeline, cat_features),
    ('num', numerical_pipeline, num_features)
])
def Initialize(self, trainFileName, devFileName, testFileName):
    trainList = []
    trainResult = []
    self.testFeatures = []
    self.devFeatures = []
    self.trainFeatures = []
    self.train = []
    #self.dev = []
    #self.test = []
    self.devResult = []
    self.rawResult = []
    print "train feature processing..."
    with open(trainFileName) as trainFile:
        for line in trainFile:
            line = line.decode('utf-8').strip()
            if not line:
                continue
            space = line.find(" ")
            if space < 5:
                continue
            answer, train = line[:space].upper(), line[space + 1:]
            li, ans = self.lineProc(train, answer, True)
            trainList += li
            trainResult += ans
            self.trainFeatures.append(li)
            self.rawResult.append(self.languages[answer])
    with open(devFileName) as devFile:
        for line in devFile:
            line = line.decode('utf-8').strip()
            if not line:
                continue
            space = line.find(" ")
            if space < 5:
                continue
            answer, train = line[:space].upper(), line[space + 1:]
            li = self.lineProc(train, answer, False)
            self.devFeatures.append(li)
            self.devResult.append(self.languages[answer])
    with open(testFileName) as testFile:
        for line in testFile:
            if not line:
                continue
            line = line.decode('latin-1').strip()
            test = self.lineProc(line, "", False)
            self.testFeatures.append(test)

    trainList, trainResult = self.FisherYatesShuffle(trainList, trainResult)
    trainResult = np.array(trainResult)
    self.trainResult = self.answerLables.fit_transform(trainResult)

    self.trainLabels = preprocessing.LabelEncoder()
    featureList = list(self.c)
    self.trainLabels.fit(featureList)
    #print self.trainLabels.classes_
    length = len(self.c)
    print "feature length:", length
    self.v = preprocessing.OneHotEncoder(n_values=length)
    trainList = np.array(trainList)
    self.train = self.trainLabels.transform(
        trainList.ravel()).reshape(*trainList.shape)
    self.train = self.v.fit_transform(self.train).toarray()
    print "train shape", self.train.shape
test = test.drop('Trip_ID', axis=1)
train = train.drop(['Trip_ID', 'Surge_Pricing_Type'], axis=1)

for f in cat_cols:
    print(f)
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(train[f].values) + list(test[f].values))
    train[f] = lbl.transform(list(train[f].values))
    test[f] = lbl.transform(list(test[f].values))

train = train.fillna(0).values
test = test.fillna(0).values

y = y - 1

ohe = preprocessing.OneHotEncoder(categorical_features=[1, 4, 5, 11],
                                  sparse=False)
ohe.fit(train)
train = ohe.transform(train)
test = ohe.transform(test)

scl = preprocessing.StandardScaler()
train = scl.fit_transform(train)
test = scl.transform(test)

y_enc = np_utils.to_categorical(y)


def nn_model():
    model = Sequential()
    print('Build model...')
def one_hot_encoding():
    encoder = preprocessing.OneHotEncoder(categories='auto')
    encoder.fit([[0, 2, 1, 12], [1, 3, 5, 3], [2, 3, 2, 12], [1, 2, 4, 3]])
    encoded_vector = encoder.transform([[2, 3, 5, 3]]).toarray()
    print("\nEncoded vector =", encoded_vector)
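# Follow-up sketch: categories_ shows how the 11 output columns of the
# example above are laid out (one one-hot block per input feature, with
# each feature's categories sorted ascending):
from sklearn import preprocessing

encoder = preprocessing.OneHotEncoder(categories='auto')
encoder.fit([[0, 2, 1, 12], [1, 3, 5, 3], [2, 3, 2, 12], [1, 2, 4, 3]])
print(encoder.categories_)
# [array([0, 1, 2]), array([2, 3]), array([1, 2, 4, 5]), array([ 3, 12])]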
cat_imputer = CategoricalImputer()
cat_imputer.fit(titanic_train['Embarked'])
print(cat_imputer.fill_)
titanic_train['Embarked'] = cat_imputer.transform(titanic_train['Embarked'])

encodable_columns = ['Sex', 'Embarked', 'Pclass']
feature_defs = [(col_name, preprocessing.LabelEncoder())
                for col_name in encodable_columns]
mapper = DataFrameMapper(feature_defs)
mapper.fit(titanic_train)
titanic_train[encodable_columns] = mapper.transform(titanic_train)

titanic_train1 = titanic_train.drop(
    ['PassengerId', 'Name', 'Cabin', 'Ticket', 'Survived'], axis=1)

one_hot_encoder = preprocessing.OneHotEncoder(
    categorical_features=np.array([0, 1, 6]))
one_hot_encoder.fit(titanic_train1)
print(one_hot_encoder.n_values_)
X_train = one_hot_encoder.transform(titanic_train1).toarray()
y_train = titanic_train[['Survived']]

dt_estimator = tree.DecisionTreeClassifier(random_state=100)
dt_grid = {'criterion': ['gini', 'entropy'], 'max_depth': [3, 4, 5, 6, 7, 8]}
grid_dt_estimator = model_selection.GridSearchCV(dt_estimator,
                                                 dt_grid,
                                                 cv=10,
                                                 refit=True,
                                                 return_train_score=True)
grid_dt_estimator.fit(X_train, y_train)
print(grid_dt_estimator.best_estimator_)
for code_table in code_tables:
    size = len(code_table)  # number of distinct values in this column
    # distinct values of this column, sorted ascending
    sortcode_table = sorted(code_table.keys())
    for key, val in enumerate(sortcode_table):
        print(key, val)
        # code_table[val] = np.zeros(shape=size)  # create a zero vector of that length
        # code_table[val][key] = 1

# encode using the dictionaries
ohe_samples = []
for row in raw_samples:
    ohe_sample = np.array([], dtype=int)
    for key, val in enumerate(row):
        # hstack takes a tuple of arrays (horizontal concatenation)
        ohe_sample = np.hstack((ohe_sample, code_tables[key][val]))
    ohe_samples.append(ohe_sample)

# one-hot encoding
one = sp.OneHotEncoder(sparse=True, dtype='int')
ohe_samples = one.fit_transform(raw_samples)
print(ohe_samples)

# reuse the fitted encoder; if a column contains a code that was not
# seen during fitting, the result will be wrong (or raise an error)
new_samples = np.array([
    [7, 8, 9],
    [2, 5, 2],
])
ohe_samples2 = one.transform(new_samples)
print(ohe_samples2)
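# Sketch of handling those unseen codes with a newer scikit-learn (>= 0.20),
# assuming sp is sklearn.preprocessing as above: handle_unknown='ignore'
# turns unseen codes into all-zero blocks instead of failing.
one = sp.OneHotEncoder(handle_unknown='ignore', dtype='int')
ohe_samples = one.fit_transform(raw_samples)
print(one.transform(new_samples).toarray())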
def __init__(self):
    self.std_scaler = preprocessing.StandardScaler()
    self.oht_scaler = preprocessing.OneHotEncoder()
    self.std_scaled = False
    self.oht_scaled = False
    # numeric
    ('numeric_variables_processing',
     pipeline.Pipeline(
         steps=[('selecting',
                 preprocessing.FunctionTransformer(
                     lambda data: data[:, numeric_data_indices])),
                ('scaling', preprocessing.StandardScaler(with_mean=0))])),

    # categorical
    ('categorical_variables_processing',
     pipeline.Pipeline(
         steps=[('selecting',
                 preprocessing.FunctionTransformer(
                     lambda data: data[:, categorical_data_indices])),
                ('hot_encoding',
                 preprocessing.OneHotEncoder(handle_unknown='ignore'))])),
]

# (a linear_model.SGDRegressor could be swapped in here instead)
regressor = linear_model.Lasso(max_iter=2000)

estimator = pipeline.Pipeline(steps=[
    ('feature_processing',
     pipeline.FeatureUnion(transformer_list=transformer_list)),
    ('model_fitting', regressor)
])

estimator.fit(train_data, train_labels)
predicted = estimator.predict(test_data)
print("RMSLE: ", rmsle(test_labels, predicted))
import tensorflow as tf
import numpy as np
import pandas as pd
import fashion_data_import as fin
from sklearn import preprocessing

enc = preprocessing.OneHotEncoder()  # creating new encoder object

# log path
LOG_PATH = '/home/wataru/machineLearn/kaggle/TF_practice/estimator/customEstimator/log'

# importing the fashion MNIST data from the script as numpy arrays
Xtrain, ytrain = fin.data_in('train')
Xtest, ytest = fin.data_in('test')
num_labels = np.unique(ytrain).size

# one-hot matrix for labels: fit on the training labels once,
# then reuse the same encoder for the test labels
enc.fit(ytrain)
ytrain = enc.transform(ytrain).toarray()
ytest = enc.transform(ytest).toarray()

# parameters
learning_rate = 0.001
batchSize = 500
LOGDIR = '/home/wataru/machineLearn/kaggle/TF_practice/estimator/customEstimator/log2'
num_iter = 50000


def model_fn(features, labels, mode, params):
# Preprocess categorical labels
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
import pandas as pd

raw_data = {
    'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
    'last_name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'],
    'age': [42, 52, 36, 24, 73],
    'city': ['San Francisco', 'Baltimore', 'Miami', 'Douglas', 'Boston']
}
df = pd.DataFrame(raw_data,
                  columns=['first_name', 'last_name', 'age', 'city'])
df

# Create dummy variables for every unique category in df.city
pd.get_dummies(df["city"])

# Convert string categorical names to integers
integerized_data = preprocessing.LabelEncoder().fit_transform(df["city"])

# View data
integerized_data

# Convert integer categorical representations to OneHot encodings
preprocessing.OneHotEncoder().fit_transform(integerized_data.reshape(
    -1, 1)).toarray()
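# Aside (assuming scikit-learn >= 0.20): OneHotEncoder accepts string
# columns directly, so the LabelEncoder round-trip above can be skipped:
preprocessing.OneHotEncoder().fit_transform(df[["city"]]).toarray()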
# TODO: create a LabelEncoder object and fit it to each feature in X

# 1. INSTANTIATE
# encode labels with values between 0 and n_classes-1.
le = preprocessing.LabelEncoder()

# 2/3. FIT AND TRANSFORM
# use df.apply() to apply le.fit_transform to all columns
X_2 = X.apply(le.fit_transform)
print("head:", X_2.head())
print("shape of X_2 after transform:", X_2.shape)
print("classes (of the last column fitted):", le.classes_)

# TODO: create a OneHotEncoder object, and fit it to all of X

# 1. INSTANTIATE
enc = preprocessing.OneHotEncoder()

# 2. FIT
enc.fit(X_2)

# 3. TRANSFORM
onehotlabels = enc.transform(X_2).toarray()
print("shape:", onehotlabels.shape)
print(onehotlabels)
# as you can see, you have the same number of rows (891), but now many
# more columns, because the categorical data was expanded into one
# column per category


def getLabelEncoder(values1):
def _one_hot_encoding(self):
    one_hot_encoders = preprocessing.OneHotEncoder()
    one_hot_encoders.fit(self.df[self.cat_feats].values)
    return one_hot_encoders.transform(self.df[self.cat_feats].values)