'workex': 'category', 'specialisation': 'category', 'status': 'category' }).drop(columns=['sl_no']).dropna().assign(outlier=lambda x: (np.abs( zscore(x.select_dtypes('number'))) < 3).all(axis=1)).dropna().drop( columns=['status', 'outlier'])) y = df.pop('salary') # df.head() # df.shape #%% Preprocessor functions ohe = ce.OneHotEncoder( drop_invariant=True, return_df=True, use_cat_names=True, handle_missing='return_nan') # Remember replace(np.nan, 0) tge = ce.TargetEncoder( drop_invariant=True, return_df=True, handle_missing='value', # min_samples_leaf=3, # smoothing=0.4, ) num_cols = ['ssc_p', 'hsc_p', 'degree_p', 'etest_p', 'mba_p'] cat_cols = [ 'gender', 'ssc_b', 'hsc_b', 'hsc_s', 'degree_t', 'workex', 'specialisation' ]
'sick.arff', 'solar.flare1.arff', 'solar.flare2.arff', 'soybean.arff', 'spectrometer.arff', 'sponge.arff', 'tic-tac-toe.arff', 'trains.arff', 'vote.arff', 'vowel.arff' ] # We painstakingly initialize each encoder here because that gives us the freedom to initialize the # encoders with any setting we want. encoders = [ #category_encoders.BackwardDifferenceEncoder(), category_encoders.BaseNEncoder(), category_encoders.BinaryEncoder(), category_encoders.HashingEncoder(), # category_encoders.HelmertEncoder(), category_encoders.JamesSteinEncoder(), category_encoders.LeaveOneOutEncoder(), category_encoders.MEstimateEncoder(), category_encoders.OneHotEncoder(), category_encoders.OrdinalEncoder(), # category_encoders.PolynomialEncoder(), # category_encoders.SumEncoder(), category_encoders.TargetEncoder(), category_encoders.WOEEncoder() ] # Initialization if os.path.isfile('./output/result.csv'): os.remove('./output/result.csv') # Loop over datasets, then over encoders, and finally, over the models for dataset_name in datasets: X, y, fold_count = arff_loader.load(dataset_name) non_numeric = list(X.select_dtypes(exclude=[np.number]).columns.values)
df['Desc'] = df['Desc'].apply(remove_symbol) print(df.head(10)) train, test = train_test_split(df, test_size=0.3, random_state=42) # Acquisizione delle stop word file_stopw = open("stop_word.pck", "rb") stop_word = pickle.load(file_stopw) # prepare tokenizer t = Tokenizer() t.fit_on_texts(df['Desc']) vocab_size = len(t.word_index) + 1 #prepare class encoder le = ce.OneHotEncoder(return_df=False, impute_missing=False, handle_unknown="ignore") labels = le.fit(list(df['Code'])) print(labels) print(le.category_mapping) print(len(le.category_mapping)) # integer encode the documents encoded_train = t.texts_to_sequences(train['Desc']) max_length = 256 padded_train = pad_sequences(encoded_train, maxlen=max_length, padding='post') print(padded_train) train_labels = train['Code']
import numpy as np import matplotlib.pyplot as plt import pandas as pd from keras import regularizers from keras.models import Sequential from keras.layers import Dense, Dropout from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler from sklearn.compose import ColumnTransformer from sklearn.model_selection import train_test_split from keras.optimizers import Adam, SGD from sklearn.utils import shuffle from keras import regularizers import category_encoders as ce oneHotEncoder = ce.OneHotEncoder(cols=[0, 1, 2]) np.random.seed(133) ENCODERS_FIT = False AGE_LIMIT = 1991 def evaluate_predictions(y_pred, y_real, buffer): correct_count = 0 n = len(y_pred) for i in range(n):
#Remove duplicates to get unique events/sensors unique_events = list(dict.fromkeys(all_events)) unique_sensors = list(dict.fromkeys(all_sensors)) #Use humansorting on events/sensors unique_events.sort(key=natural_keys) unique_sensors.sort(key=natural_keys) all_sensors.sort(key=natural_keys) all_events.sort(key=natural_keys) unique_sensors_dataframe = pd.DataFrame(data=unique_sensors, columns=['sensor']) all_sensors_dataframe = pd.DataFrame(data=all_sensors, columns=['sensor']) unique_events_dataframe = pd.DataFrame(data=unique_events, columns=['event']) all_events_dataframe = pd.DataFrame(data=all_events, columns=['event']) sensor_encoder = category_encoders.OneHotEncoder(cols=['sensor']) event_encoder = category_encoders.OneHotEncoder(cols=['event']) sensor_encoder.fit(unique_sensors_dataframe) event_encoder.fit(unique_events_dataframe) sensors_classes = sensor_encoder.transform(unique_sensors_dataframe) encoded_sensors = sensor_encoder.transform(all_sensors_dataframe) event_classes = event_encoder.transform(unique_events_dataframe) encoded_events = event_encoder.transform(all_events_dataframe) sensor_results = combine_df_columns(encoded_sensors) event_results = combine_df_columns(encoded_events) event_results2 = dict()
return X train_race, val_race = train_test_split(trainval, random_state=42) train = wrangle(train_race) val = wrangle(val_race) X_train = train.drop(columns=target) X_val = val.drop(columns=target) y_train = train[target] y_val = val[target] numeric_features = train_features.select_dtypes( include='number').columns.tolist() cardinality = train_features.select_dtypes(exclude='number').nunique() categorical_features = cardinality[cardinality <= 2000].index.tolist() features = numeric_features + categorical_features # model model = make_pipeline( ce.OneHotEncoder(use_cat_names=True, handle_unknown='ignore'), SimpleImputer(strategy='median'), RandomForestClassifier(random_state=0, n_jobs=-1)) # Fit on train, score on val model.fit(X_train, y_train) model.predict(X_val) print(model) dump(model, 'model.pkl')
# Bin 'Per Share Net profit before tax' into four ordinal labels.
# Each elif only needs the upper bound: the lower bound is implied by the
# previous branch having failed.
for i in X['Per Share Net profit before tax']:
    if i < 0.17037:
        x.append('low')
    elif i < 0.179709:
        x.append('low-medium')
    elif i < 0.193493:
        x.append('high-medium')
    else:
        x.append('high')
X['Per Share Net profit before tax bin'] = pd.Series(x)

# BUG FIX: a bare `pip install category_encoders` line is notebook shell
# syntax and a SyntaxError in a plain Python script — install from the
# shell (or use `%pip install` in a notebook) instead.
# pip install category_encoders

# one hot encode
import category_encoders as ce
encoder = ce.OneHotEncoder(cols=['Per Share Net profit before tax bin',
                                 'Retained Earnings to Total Assets bin'],
                           handle_unknown='return_nan',
                           return_df=True,
                           use_cat_names=True)
X = encoder.fit_transform(X)
X.head()

# hash encode
# Create object for hash encoder
encoder = ce.HashingEncoder(cols=['Per Share Net profit before tax bin',
                                  'Retained Earnings to Total Assets bin'],
                            n_components=4)
# Fit and Transform Data
X = encoder.fit_transform(X)
X.head()

# Create object for binary encoding
encoder = ce.BinaryEncoder(cols=['Per Share Net profit before tax bin',
                                 'Retained Earnings to Total Assets bin'],
                           return_df=True)
X = encoder.fit_transform(X)
# Registries for the benchmark: encoders for the categorical columns and
# the candidate models, both keyed by a human-readable name.
import category_encoders as ce
from sklearn import ensemble
import xgboost as xgb

# One-hot encoder covering the bin_*/nom_* categorical columns
# (column names suggest a Kaggle categorical-encoding dataset — TODO confirm).
NOM_ENCODER = {
    'OneHotEncoder': ce.OneHotEncoder(cols=['bin_0', 'bin_1', 'bin_2',
                                            'bin_3', 'bin_4', 'nom_0',
                                            'nom_1', 'nom_2', 'nom_3',
                                            'nom_4'])
}

# Candidate classifiers with their fixed hyper-parameters.
MODELS = {
    'RandomForest': ensemble.RandomForestClassifier(n_estimators=200,
                                                    n_jobs=1,
                                                    verbose=2),
    'xgBoost': xgb.XGBClassifier(max_depth=15,
                                 learning_rate=0.03,
                                 n_estimators=400,
                                 verbosity=1,
                                 objective='binary:logistic')
}
"""START: Import encoders""" import category_encoders as ce import sys sys.path.append('../encoders/') from ceng import CENGEncoder from pattern_preserving import SimplePPEncoder, AgingPPEncoder, GeneticPPEncoder from entity_embedding import EntityEmbeddingEncoder from cesamo import CESAMOEncoder Encoders = { 'Ordinal': ce.OrdinalEncoder(), 'Polynomial': ce.PolynomialEncoder(), 'OneHot': ce.OneHotEncoder(), 'BackwardDifference': ce.BackwardDifferenceEncoder(), 'Helmert': ce.HelmertEncoder(), 'EntityEmbedding': EntityEmbeddingEncoder(), 'TargetEnc': ce.TargetEncoder(), 'WOE': ce.WOEEncoder(), 'CENG': CENGEncoder(verbose=0), 'GeneticPP': GeneticPPEncoder(estimator_name='LinearRegression', num_predictors=2), 'AgingPP':
def norm_data(X_train, X_test, y_train, y_test, real=None, categ=None, all=True): '''Preprocessing features''' # ------------- Split data on real and categ ----------------- X_train_categ = np.hstack((X_train[:, :2], X_train[:, 81:82])) X_test_categ = np.hstack((X_test[:, :2], X_test[:, 81:82])) X_train_real = np.hstack((X_train[:, 2:81], X_train[:, 82:])) X_test_real = np.hstack((X_test[:, 2:81], X_test[:, 82:])) # ------- Check flag that we want to use all data for encoding -------- if all == True: X_all_categ = np.append(X_train_categ, X_test_categ, axis=0) #print (X.shape, X_train_categ.shape, X_test_categ.shape) y_all = np.append(y_train, y_test, axis=0) #print (y_all.shape, y_train.shape, y_test.shape) else: X_all_categ = X_train_categ y_all = y_train # ------- Norm of real data on mean and deviation -------- if real == 'standart': ss = StandardScaler() X_train_real_res = ss.fit_transform(X_train_real) X_test_real_res = ss.transform(X_test_real) elif real == 'normal': min_max_scaler = preprocessing.MinMaxScaler() X_train_real_res = min_max_scaler.fit_transform(X_train_real) X_test_real_res = min_max_scaler.transform(X_test_real) else: X_train_real_res = X_train_real X_test_real_res = X_test_real # ------- Encoding of categorical features ----------- if categ == 'target': encoder = ce.TargetEncoder(cols=[0, 1, 2], return_df=False) encoder.fit(X_all_categ, y_all) X_train_categ_res = encoder.transform(X_train_categ) X_test_categ_res = encoder.transform(X_test_categ) elif categ == 'onehot': encoder = ce.OneHotEncoder(cols=[0, 1, 2], return_df=False) encoder.fit(X_all_categ, y_all) X_train_categ_res = encoder.transform(X_train_categ) X_test_categ_res = encoder.transform(X_test_categ) elif categ == 'helmert': encoder = ce.HelmertEncoder(cols=[0, 1, 2], return_df=False) encoder.fit(X_all_categ, y_all) X_train_categ_res = encoder.transform(X_train_categ) X_test_categ_res = encoder.transform(X_test_categ) elif categ == 'hash': encoder = ce.HashingEncoder(cols=[0, 1, 
2], return_df=False) encoder.fit(X_all_categ, y_all) X_train_categ_res = encoder.transform(X_train_categ) X_test_categ_res = encoder.transform(X_test_categ) else: X_train_categ_res = X_train_categ X_test_categ_res = X_test_categ # ------------ Joy data together --------------- X_train_ready = np.hstack((X_train_categ_res, X_train_real_res)) X_test_ready = np.hstack((X_test_categ_res, X_test_real_res)) return X_train_ready, X_test_ready
train_processed.reset_index(drop=True, inplace=True) target = train_processed["rent"] target_log = np.log1p(target) train_processed.drop(["id", "rent"], axis=1, inplace=True) test_processed.drop("id", axis=1, inplace=True) #################### ## get feature #################### # モデル学習用データフレーム(category encoderの都合で分ける) train_use = pd.DataFrame() test_use = pd.DataFrame() ### location ### ce_ordinal = ce.OneHotEncoder(cols=["district"], handle_missing="value") train_use["district"] = train_processed["district"] test_use["district"] = test_processed["district"] train_use = ce_ordinal.fit_transform(train_use) test_use = ce_ordinal.transform(test_use) ### access ### train_use["min_to_nearest_sta"] = train_processed["access_min"].apply( lambda x: min(x) if x else np.nan) test_use["min_to_nearest_sta"] = test_processed["access_min"].apply( lambda x: min(x) if x else np.nan) train_use["num_sta"] = train_processed["access_sta"].apply(lambda x: len(x)) test_use["num_sta"] = test_processed["access_sta"].apply(lambda x: len(x)) # 路線
def one_hot(data, column_names):
    """One-hot encode the given columns of *data*.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame containing categorical columns.
    column_names : list of str
        Names of the columns to one-hot encode.

    Returns
    -------
    pandas.DataFrame
        Frame with the listed columns replaced by indicator columns.
    """
    # BUG FIX: the first positional parameter of ce.OneHotEncoder is
    # `verbose`, not `cols` — the original ce.OneHotEncoder(column_names)
    # ignored the requested columns (encoding every non-numeric column)
    # and passed the list as the verbosity setting.
    encoder = ce.OneHotEncoder(cols=column_names)
    data_transformed = encoder.fit_transform(data)
    return data_transformed
# A - Label encoding z = pd.DataFrame() z['state'] = df['State'] le = LabelEncoder() z['le_state'] = le.fit_transform(np.ravel(z)) print("label encode ~ state") print('') # data quality issue print('Max categorical value for state is %s.\n' % z['le_state'].max()) # B - Hot One encoding y = pd.DataFrame() y['state'] = df['State'] oh = ce.OneHotEncoder(cols=['state']) x = oh.fit_transform(y) w = pd.concat([y, x], axis=1, ignore_index=False) print("hot one encode ~ state") print('') # C - Binary encoding v = pd.DataFrame() v['state'] = df['State'] be = ce.BinaryEncoder(cols=['state']) u = be.fit_transform(v) t = pd.concat([v, u], axis=1, ignore_index=False) print("binary encode ~ state") print('')
#dirty columns include: kids, courses none_i = re.compile(r'none', flags=re.IGNORECASE) # df.kids = none_i.sub(r'none\i', df.kids) df['kids'].replace(none_i, 0, inplace=True) df.kids = df['kids'].str.extract(r'^(\d+)', expand=False) print(f'kids are {df.kids.unique()}') print(f'gender are {df.gender.unique()}') print(f'industry are {df.industry.unique()}') print(f'military are {df.Military.unique()}') # print(f'courses are {df.NumCourses.unique()}') onehotecoder = ce.OneHotEncoder(cols=[ "gender", "InUS", "ethnicity", "Usstate", "marrital", "employment", "industry" ], handle_unknown='impute') df = onehotecoder.fit_transform(df) col_non_num = [c for c in df.columns if df[c].dtype == 'object'] print('no error2') df.drop(columns=col_non_num, inplace=True) print(df.shape) print(df.dtypes) print(df.head(10)) # fill with mode, mean, or median df_mode, df_mean, df_median = df.mode().iloc[0], df.mean(), df.median()
def predict(user_data):
    """Train a contraceptive-method classifier on the bundled survey data
    and return the predicted method label for *user_data*.

    Parameters
    ----------
    user_data : pandas.DataFrame-like
        Feature row(s) with the same columns as the training features —
        TODO confirm the exact expected shape against the caller.

    Returns
    -------
    The predicted 'Current contraceptive method' label of the first row.
    """
    # Load the cleaned dataset shipped next to this module.
    m_path = Path(__file__).parent
    path = m_path.joinpath('dataset/clean_data.csv')
    df = pd.read_csv(str(path))
    # Keep only respondents who use some method, and shorten verbose labels.
    df = df.loc[df['Current contraceptive method'] != 'Not using']
    df['Current contraceptive method'] = df['Current contraceptive method'].replace('Calendar or rhythm method/Periodic abstinence', 'Periodic abstinence', regex=True)
    df['Current contraceptive method'] = df['Current contraceptive method'].replace('Implants/Norplant', 'Implants', regex=True)
    df['Current contraceptive method'] = df['Current contraceptive method'].replace('Mucus/Billing/Ovulation', 'Ovulation', regex=True)
    # Feature columns used for training.
    columns = ["Respondent's current age", 'Age of respondent at 1st birth',
               'Age at first menstrual period', 'Recent sexual activity',
               'Region', 'Type of place of residence',
               'Current marital status', 'Births in last five years',
               'Births in last three years', 'Births in past year',
               'Currently pregnant', 'Total number all pregnacies',
               'Decision maker for using contraception',
               'Decision maker for not using contraception',
               'Preferred future method', 'Smokes cigarettes',
               'Smokes pipe full of tobacco', 'Chews tobacco',
               'Snuffs by nose', 'Smokes kreteks',
               'Smokes cigars, cheroots or cigarillos', 'Smokes water pipe',
               'Snuff by mouth', 'Chews betel quid with tobacco',
               "Husband's desire for children", 'Exposure', 'Unmet need',
               'Unmet need (definition 2)',
               'Unmet need for contraception (definition 3)'
               ]
    X = df[columns]
    y = df['Current contraceptive method']
    # Test split is made but unused below — model quality is not reported here.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
    # One-hot encode every categorical feature column.
    X_encoder = ce.OneHotEncoder(cols=[
        'Recent sexual activity', 'Region', 'Type of place of residence',
        'Current marital status', 'Currently pregnant',
        'Decision maker for using contraception',
        'Decision maker for not using contraception',
        'Preferred future method', 'Smokes cigarettes',
        'Smokes pipe full of tobacco', 'Chews tobacco', 'Snuffs by nose',
        'Smokes kreteks', 'Smokes cigars, cheroots or cigarillos',
        'Smokes water pipe', 'Snuff by mouth',
        'Chews betel quid with tobacco', "Husband's desire for children",
        'Exposure', 'Unmet need', 'Unmet need (definition 2)',
        'Unmet need for contraception (definition 3)'
    ])
    # X_train = X_encoder.fit_transform(X_train)
    # X_test = X_encoder.transform(X_test)
    rf_classifier = RandomForestClassifier(n_estimators=100)
    # rf_classifier.fit(X_train, y_train)
    # Preprocess, Use Model, and Train
    model = Pipeline([("preprocessing", X_encoder), ("model", rf_classifier)]).fit(X_train, y_train)
    user_encode = model.predict(user_data)
    # Retrieve and return text
    result_text = user_encode[0]
    return result_text
# train test split the data (80/20) train, val = train_test_split(df, train_size=0.80, test_size=.20, stratify=df[target], random_state=42) #%% X_train = train[features] y_train = train[target] X_val = val[features] y_val = val[target] #%% # fit a pipeline (Decision Tree) pipelineTree = make_pipeline(ce.OneHotEncoder(use_cat_names=True), SimpleImputer(strategy='mean'), StandardScaler(), DecisionTreeClassifier(max_depth=3)) pipelineTree.fit(X_train, y_train) #%% # validation accuracy (Decision Tree) y_pred_tree = pipelineTree.predict(X_val) print('Validation Accuracy', accuracy_score(y_val, y_pred_tree)) #%% y_pred_tree #%% # fit a pipeline (Random Forest)
def main(dataSetName, X, y):
    """Benchmark a set of categorical encoders on one dataset.

    For each encoder, fits a logistic-regression pipeline via
    `score_models` and records the score, its std-dev, the encoded
    dimensionality, and the elapsed time.

    Parameters
    ----------
    dataSetName : sequence
        Dataset identifier; element 0 is stored in the results table.
    X, y : array-like
        Features and target passed through to `score_models`.

    Returns
    -------
    pandas.DataFrame
        One row per encoder with columns
        ['Encoding', 'Dataset', 'Dimensionality', 'Avg. Score',
         'Score StDev', 'Elapsed Time'].
    """
    scores = []
    raw_scores_ds = {}

    # Loading logistic regression classifier
    clf = linear_model.LogisticRegression()

    # Name -> encoder class, in the original evaluation order.  Replaces the
    # ten-branch if-ladder (which would silently reuse the previous
    # iteration's encoder if a name ever failed to match).
    encoder_classes = {
        "BackwardDifferenceEncoder": ce.BackwardDifferenceEncoder,
        "BinaryEncoder": ce.BinaryEncoder,
        "HashingEncoder": ce.HashingEncoder,
        "HelmertEncoder": ce.HelmertEncoder,
        "OneHotEncoder": ce.OneHotEncoder,
        "OrdinalEncoder": ce.OrdinalEncoder,
        "SumEncoder": ce.SumEncoder,
        "PolynomialEncoder": ce.PolynomialEncoder,
        "BaseNEncoder": ce.BaseNEncoder,
        "LeaveOneOutEncoder": ce.LeaveOneOutEncoder,
    }
    encoders = list(encoder_classes)
    print(encoders)

    for encoder_name in encoders:
        print(encoder_name)
        # NOTE(review): columnsToEncode is a module-level global — confirm it
        # is defined before main() runs.
        encoder = encoder_classes[encoder_name](cols=columnsToEncode)
        print(encoder)
        start_time = time.time()
        score, stds, raw_scores, dim = score_models(clf, X, y, encoder,
                                                    encoder_name, dataSetName)
        scores.append([
            encoder_name, dataSetName[0], dim, score, stds,
            time.time() - start_time
        ])
        raw_scores_ds[encoder_name] = raw_scores
        # Encoded frames can be large; reclaim memory between encoders.
        gc.collect()

    results = pd.DataFrame(scores,
                           columns=[
                               'Encoding', 'Dataset', 'Dimensionality',
                               'Avg. Score', 'Score StDev', 'Elapsed Time'
                           ])
    return results
def __init__(self, **params):
    """Initialize the wrapper: forward *params* to the parent class, then
    build the underlying one-hot encoder.

    NOTE(review): self.transformer_params is not set here, so it is
    presumably populated by super().__init__ — confirm against the base
    class.
    """
    super().__init__(**params)
    self.transformer = ce.OneHotEncoder(**self.transformer_params)
#data pre-processing df_bank, cat_cols_bank = bank_data_prep(bank_data) #df_adult, cat_cols_adult=adult_data_prep(adult_data) #%%calculate the memory usage of the prepared data frame BYTES_TO_MB = 0.000001 print(round(df_bank.memory_usage(deep=True).sum() * BYTES_TO_MB, 3)) #round(df_adult.memory_usage(deep=True).sum()* BYTES_TO_MB, 3) #adult_data.info(memory_usage='deep') #%% different embedding # one-hot encoding start_time = time.time() one_hot_encoder = ce.OneHotEncoder(cols=cat_cols_bank) one_hot_transformed = one_hot_encoder.fit_transform(df_bank) print('computation time of one-hot :', time.time() - start_time) print( 'Memory usage after encoding: ', round(one_hot_transformed.memory_usage(deep=True).sum() * BYTES_TO_MB, 3)) # label encode start_time = time.time() label_encoder = ce.OrdinalEncoder(cols=cat_cols_bank) label_transformed = label_encoder.fit_transform(df_bank) print('computation time of label:', time.time() - start_time) print('Memory usage after encoding: ', round(label_transformed.memory_usage(deep=True).sum() * BYTES_TO_MB, 3)) #hash encoding with md5 hash function
# In[30]: one-hot encode the (already integer-encoded) training categories
# with scikit-learn's OneHotEncoder.
encoder = OneHotEncoder()
housing_cat_1hot = encoder.fit_transform(
    housing_category_encoded.reshape(-1, 1))
housing_cat_1hot

# In[31]: same for the test categories.
encoder_test = OneHotEncoder()
housing_test_cat_1hot = encoder_test.fit_transform(
    housing_test_category_encoded.reshape(-1, 1))
housing_test_cat_1hot

# In[32]: alternative encoding with category_encoders (returns a DataFrame).
encoder = ce.OneHotEncoder()
housing_cat_reshaped = housing_category.values.reshape(-1, 1)
encoder.fit(housing_cat_reshaped)
X_cleaned = encoder.transform(housing_cat_reshaped)
# BUG FIX: DataFrame.as_matrix() was deprecated and removed in pandas 1.0;
# to_numpy() is the supported replacement with the same result.
cat_data = X_cleaned.to_numpy()
print(X_cleaned[0:5])
print(type(cat_data))
print(cat_data.shape)

# In[33]: same alternative encoding for the test categories.
encoder_test = ce.OneHotEncoder()
housing_test_cat_reshaped = housing_test_category.values.reshape(-1, 1)
encoder_test.fit(housing_test_cat_reshaped)
X_test_cleaned = encoder_test.transform(housing_test_cat_reshaped)
cat_test_data = X_test_cleaned.to_numpy()
def main():
    """End-to-end training pipeline: load the data, one-hot encode
    categoricals, pick a feature count with Optuna, train LightGBM with
    stratified K-fold CV, and write averaged test predictions to
    SAVE_DATA_PATH.
    """
    train_df = pd.read_csv(TRAIN_DATA_PATH)
    test_df = pd.read_csv(TEST_DATA_PATH)
    # Tag rows with their origin so the concatenated frame can be split
    # back after encoding; test rows get a dummy label.
    train_df["usage"] = "train"
    test_df["usage"] = "test"
    test_df["left"] = 100
    df = pd.concat([train_df, test_df], axis=0)
    usage = df.loc[:, "usage"]
    label = df.loc[:, "left"]
    df = df.drop(["usage", "left"], axis=1)
    # One-hot encode every object-dtype column.
    categorical_columns = [c for c in df.columns if df[c].dtype == 'object']
    ce_ohe = ce.OneHotEncoder(cols=categorical_columns, handle_unknown='impute')
    encorded_df = ce_ohe.fit_transform(df)
    encorded_df = pd.concat([encorded_df, usage, label], axis=1)
    # Split back into the train and test parts.
    train = encorded_df[encorded_df["usage"] == "train"].drop(
        "usage", axis=1).reset_index(drop=True)
    test = encorded_df[encorded_df["usage"] == "test"].drop(
        "usage", axis=1).reset_index(drop=True)
    train_x = train.drop(["left", "index"], axis=1)
    train_y = train.loc[:, "left"]
    index = test.loc[:, "index"]
    test_x = test.drop(["left", "index"], axis=1)
    f = partial(objective, train_x, train_y)  # bind the data into the objective
    study = optuna.create_study(
        direction='maximize')  # optimize the number of features to keep
    study.optimize(f, n_trials=10)  # number of Optuna trials
    print('params:', study.best_params)  # report the best parameters found
    best_feature_count = study.best_params['n_features_to_select']
    train_x, train_y = get_important_features(train_x, train_y,
                                              best_feature_count)
    # Stratified K-fold training; test predictions are averaged over folds.
    n_splits = 10
    best_params = get_best_params(train_x, train_y)
    submission = np.zeros((len(test_x), 1))
    acc_scores = {}
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
    for i, (tr_idx, val_idx) in enumerate(skf.split(train_x, train_y)):
        tr_x = train_x.iloc[tr_idx].reset_index(drop=True)
        tr_y = train_y.iloc[tr_idx].reset_index(drop=True)
        val_x = train_x.iloc[val_idx].reset_index(drop=True)
        val_y = train_y.iloc[val_idx].reset_index(drop=True)
        tr_dataset = lgb.Dataset(tr_x, tr_y)
        val_dataset = lgb.Dataset(val_x, val_y, reference=tr_dataset)
        model = get_model(tr_dataset, val_dataset, best_params)
        y_pred = model.predict(test_x)
        preds = pd.DataFrame(y_pred)
        submission += preds
    submission_df = pd.DataFrame(submission / n_splits)
    submission_df = pd.concat([index, submission_df], axis=1)
    print("#################################")
    print(submission_df)
    print("#################################")
    submission_df.to_csv(SAVE_DATA_PATH, header=False, index=False)
df = df.drop(columns=i) df = df.fillna(df.mode().iloc[0]) print(df.describe()) print(df.dtypes) print(df.head()) # Split dataset into training and testing sets X = df.drop(columns='Result') Y = df['Result'].copy() X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=20) # Convert categorical data to numerical ohe = ce.OneHotEncoder(use_cat_names=True) X_train_ohe = ohe.fit_transform(X_train) X_test_ohe = ohe.transform(X_test) # Training MLP clf = MLPClassifier(solver='lbfgs', activation='logistic', learning_rate='constant', max_iter=600, hidden_layer_sizes=(10, 10)) clf.fit(X_train_ohe, Y_train) prediction = clf.predict(X_test_ohe) # Evaluate MLP print(clf.get_params()) print('The accuracy on test set is: ', clf.score(X_test_ohe, Y_test))
# import category encoders # # import category_encoders as ce # # encode remaining variables with one-hot encoding # # encoder = ce.OneHotEncoder(cols=['workclass', 'education', 'marital_status', 'occupation', 'relationship', # # 'race', 'sex', 'native_country']) # # X_train = encoder.fit_transform(X_train) # # X_test = encoder.transform(X_test) # X_train.head() import category_encoders as ce #encode remaining variables with one hot encoding encoder = ce.OneHotEncoder(cols=['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country']) print(encoder) x_train = encoder.fit_transform(x_train) x_test = encoder.transform(x_test) print(x_train.shape) print(x_test.shape) # 11. Feature Scaling # Table of Contents cols = x_train.columns from sklearn.preprocessing import RobustScaler scaler = RobustScaler() x_train = scaler.fit_transform(x_train) x_test = scaler.transform(x_test) x_train = pd.DataFrame(x_train, columns=[cols]) x_test = pd.DataFrame(x_test, columns=[cols])
# rimozione SOLO dei simboli (nessuno stemming e nessuna rimozione delle stopword) # df1['Desc'] = df1['Desc'].apply(remove_symbol) # test = df1 # Acquisizione delle stop word file_stopw = open("support/stop_word.pck", "rb") stop_word = pickle.load(file_stopw) # prepare tokenizer t = Tokenizer() t.fit_on_texts(df['Desc']) vocab_size = len(t.word_index) + 1 # prepare class encoder le = ce.OneHotEncoder(return_df=False, handle_unknown="ignore") labels = le.fit(list(df['Code'])) print(labels) # integer encode the documents encoded_train = t.texts_to_sequences(train['Desc']) max_length = 64 padded_train = pad_sequences(encoded_train, maxlen=max_length, padding='post') print(padded_train) # test_ids = df1['id'] test_ids = test['Code'] train_labels = train['Code'] # print(train_labels)
import pyreadr
import pandas as pd
from sklearn.model_selection import train_test_split
import category_encoders as ce

# feature eng: prepare the Hillstrom e-mail analytics data for modelling.
df = pd.read_csv(
    'Kevin_Hillstrom_MineThatData_E-MailAnalytics_DataMiningChallenge_2008.03.20.csv'
)
# Drop columns not used downstream.
df.drop(['history_segment', "conversion", "spend"], axis=1, inplace=True)
column_names = df.columns
cat_cols = ['zip_code', 'channel']
# One-hot encode the two categorical columns, keeping readable column names.
ce_one_hot = ce.OneHotEncoder(cols=cat_cols, use_cat_names=True)
data_ohe = ce_one_hot.fit_transform(df)
# Binarize the treatment column: any e-mail -> 1, no e-mail -> 0.
data_ohe.segment = data_ohe.segment.map({
    'Womens E-Mail': 1,
    'Mens E-Mail': 1,
    'No E-Mail': 0
})
data = data_ohe.copy()
train = data_ohe.drop('visit', axis=1)
column_names = list(train.columns)
train_np = train.to_numpy().astype(float)
# Position of the treatment column within the feature matrix.
treatment_col = column_names.index('segment')
# Target: site-visit indicator.
y = data_ohe.visit.to_numpy().astype(float)
X_train, X_valid, Y_train, Y_valid = train_test_split(train_np,
                                                      y,
                                                      test_size=0.2,
                                                      stratify=y,
                                                      random_state=42)
def get_feature_encoders(data_df, features, categorical_columns):
    """Fit a one-hot encoder on *data_df* and return it together with the
    resulting (post-encoding) column names.

    NOTE(review): the `features` and `categorical_columns` parameters are
    currently unused — the encoder is fitted on the whole frame; confirm
    whether it should be restricted to `categorical_columns`.
    """
    encoder = ce.OneHotEncoder(use_cat_names=True)
    encoded_feature_names = encoder.fit_transform(data_df).columns.tolist()
    return encoder, encoded_feature_names


'''encoders = dict()
#!/usr/bin/env python2 # -*- coding: utf-8 -*- """ Created on Sun Jan 7 23:17:31 2018 @author: tgadfort """ #conda install -c conda-forge category_encoders #https://github.com/scikit-learn-contrib/categorical-encoding import category_encoders as ce encoder = ce.BackwardDifferenceEncoder(cols=[...]) encoder = ce.BinaryEncoder(cols=[...]) encoder = ce.HashingEncoder(cols=[...]) encoder = ce.HelmertEncoder(cols=[...]) encoder = ce.OneHotEncoder(cols=[...]) encoder = ce.OrdinalEncoder(cols=[...]) encoder = ce.SumEncoder(cols=[...]) encoder = ce.PolynomialEncoder(cols=[...]) encoder = ce.BaseNEncoder(cols=[...]) encoder = ce.LeaveOneOutEncoder(cols=[...])
if(row[j] == tmp[i]): flag = True if(flag == False): tmp.append(row[j]) row[j] = tmp.index(row[j]) print(row[j]) x,t = [],[] df_train = pd.read_csv("train.csv") df_test = pd.read_csv("test.csv") list_cols = ['week','soldout','name','remarks','event','payday','weather'] ce_ohe = ce.OneHotEncoder(cols=list_cols,handle_unknown='impute') df_train_ce_onehot = ce_ohe.fit_transform(df_train) df_test_ce_onehot = ce_ohe.fit_transform(df_test) train_len = len(df_train_ce_onehot) df_train_ce_onehot['precipitation'] = df_train_ce_onehot['precipitation'].str.replace('--','0') df_test_ce_onehot['precipitation'] = df_test_ce_onehot['precipitation'].str.replace('--','0') del df_train_ce_onehot['datetime'] del df_test_ce_onehot['datetime'] df_train_ce_onehot = pd.merge(df_train_ce_onehot,df_test_ce_onehot,how='outer') df_test_ce_onehot = pd.merge(df_train_ce_onehot,df_test_ce_onehot,how='outer') train_t = df_train_ce_onehot.loc[:,['y']] train_t = train_t.drop(range(train_len,len(train_t)))
import warnings warnings.filterwarnings("ignore") """START: Import encoders""" import category_encoders as ce import sys sys.path.append('../encoders/') from ceng import CENGEncoder from cesamo import CESAMOEncoder from entity_embedding import EntityEmbeddingEncoder from pattern_preserving import SimplePPEncoder, AgingPPEncoder, GeneticPPEncoder Encoders = {'Ordinal': ce.OrdinalEncoder(), 'Polynomial': ce.PolynomialEncoder(), 'OneHot': ce.OneHotEncoder(), 'BackwardDifference': ce.BackwardDifferenceEncoder(), 'Helmert': ce.HelmertEncoder(), 'EntityEmbedding': EntityEmbeddingEncoder(), 'TargetEnc': ce.TargetEncoder(), 'WOE': ce.WOEEncoder(), 'CENG': CENGEncoder(verbose = 0), 'GeneticPP': GeneticPPEncoder(), 'AgingPP': AgingPPEncoder(), 'SimplePP': SimplePPEncoder(), 'CESAMOEncoder': CESAMOEncoder()} """END: Import encoders""" """START: Import models""" try:
axis=1, inplace=False) stringFeatures = worldcupAllFeatures[[ 'Team1', 'Team2', 'Team1_Continent', 'Team2_Continent', 'Phase' ]].copy() numericFeaturePipeline = Pipeline([ ('selector', DataFrameSelector(list(numericFeatures))), ('imputer', Imputer(strategy="median")), ('std_scaler', StandardScaler()), ]) stringFeaturePipeline = Pipeline([ ('selector', DataFrameSelector(list(stringFeatures))), ('cat_encoder', cs.OneHotEncoder(drop_invariant=True)), ]) full_pipeline = FeatureUnion(transformer_list=[ ("num_pipeline", numericFeaturePipeline), ("cat_pipeline", stringFeaturePipeline), ]) preprocessedFeature = pd.DataFrame( data=full_pipeline.fit_transform(worldcupAllFeatures), index=np.arange(1, 65)) # Split the data into training/testing sets worldcupFeatureTrainingData, testData, worldcupTargetTrainingData, testTarget = \ train_test_split(preprocessedFeature, scoreAsTarget, test_size=0.2, random_state=1)