def invalue_to_similarity(invalue_df, orientation_df):
    """
    invalue_df: converted DataFrame of user inputs
    orientation_df: DataFrame of all people of that orientation
    """
    # concat input values to orientation df to prep for cosine similarity
    df = pd.concat([orientation_df, invalue_df])

    # one-hot encode
    df_encoded = OneHotEncoder(use_cat_names=True).fit_transform(df)

    # make cosine_similarity input (input X)
    cosine_input = pd.DataFrame(df_encoded.iloc[-1]).T

    # drop last encoded row (input Y -- data for input X to reference)
    df_encoded.drop(df_encoded.tail(1).index, inplace=True)

    # cosine_similarity(X, Y)
    similarity = cosine_similarity(cosine_input, df_encoded)

    # top 5 matches by similarity
    top5 = pd.DataFrame(similarity.tolist()[0],
                        columns=['similarity'],
                        index=df_encoded.index).sort_values(
                            by='similarity', ascending=False).iloc[:5]

    # return the top 5 matches with their original (unencoded) columns;
    # `cupid` is the module-level DataFrame the indices refer back to.
    # (DataFrame.append was removed in pandas 2.0, so index directly instead
    # of appending row by row.)
    results = cupid.loc[top5.index]
    return results
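# A minimal usage sketch for invalue_to_similarity (hedged: `cupid` is the
# module-level profile DataFrame the function indexes into, and the
# orientation filter and one-row `user_df` below are hypothetical):
#
#   straight_df = cupid[cupid['orientation'] == 'straight']
#   top_matches = invalue_to_similarity(user_df, straight_df)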
def __init__(self, sparksess=None, logdir='/encoder',
             handle_unknown='-99999', save_encoder=False):
    self.spark = sparksess
    self.logdir = logdir
    self.save_encoder = save_encoder  # was a bare `self.save_encoder`, which assigns nothing

    self.ordinal_encoder_features = []
    self.onehot_encoder_features = []
    self.count_encoder_features = []
    self.target_encoder_features = []

    self.ordinal_encoder = OrdinalEncoder(cols=self.ordinal_encoder_features,
                                          return_df=True,
                                          handle_unknown=handle_unknown)
    self.onehot_encoder = OneHotEncoder(cols=self.onehot_encoder_features,
                                        return_df=True,
                                        handle_unknown=handle_unknown)
    self.count_encoder = CountEncoder(cols=self.count_encoder_features,
                                      return_df=True,
                                      handle_unknown=handle_unknown)
    self.target_encoder = TargetEncoder(cols=self.target_encoder_features,
                                        return_df=True,
                                        handle_unknown=handle_unknown)
def process(
    naive_file,
    treated_file,
    metadata_file,
    resistance_files,
    outfile,
    subtype="All",
    truncate=[41, 235],
):
    print("reading sequences and metadata")
    raw_sequences, consensus = reader(naive_file, treated_file, truncate)
    metadata = read_metadata(metadata_file)

    print(f"choosing {subtype} subtype(s)")
    chosen_sequences, dataset_subtypes = choose_subtype(
        raw_sequences, metadata, subtype)

    print("Filling with consensus AAs")
    AA_sequences = fill_consensus_AAs(chosen_sequences, consensus)
    freqs = get_single_AA_freqs(AA_sequences.drop("label", axis=1))
    single_AA_sequences = get_single_AAs(AA_sequences, freqs)

    print("OneHot encoding")
    columns_to_encode = single_AA_sequences.columns.drop("label")
    encoder = OneHotEncoder(use_cat_names=True,
                            handle_unknown="ignore",
                            cols=columns_to_encode.tolist())
    encoded_sequences = encoder.fit_transform(single_AA_sequences)

    print("removing consensus features")
    features_to_remove = get_features_to_remove(dataset_subtypes)
    total_sequences = encoded_sequences.drop(columns=features_to_remove,
                                             errors="ignore")

    total_sequences["encoded_label"] = total_sequences["label"].apply({
        "treated": 1,
        "naive": 0
    }.get)
    drms = get_all_DRMs()
    total_sequences["hasDRM"] = (total_sequences.filter(
        drms, axis=1).any(axis=1).astype(int))
    total_sequences["is_resistant"] = (total_sequences[[
        "encoded_label", "hasDRM"
    ]].any(axis=1).astype(int))

    print("getting resistance scores")
    resistance_scores = get_resistance_scores(resistance_files)

    print("saving dataset to disk")
    joined = total_sequences.join(resistance_scores)
    joined.to_csv(outfile, sep="\t", index=True, header=True)
def assign_cat_scaler(self):
    self.cat_method = self.cat_info.get("method", None)
    self.cat_cols = self.cat_info.get("cols", [])

    if self.cat_method is None:
        self.cat_encoder = Empty()
    elif self.cat_method == "OrdinalEncoder":
        self.cat_encoder = OrdinalEncoder(cols=self.cat_cols)
    elif self.cat_method == "OneHotEncoder":
        self.cat_encoder = OneHotEncoder(cols=self.cat_cols)
    else:
        raise NotImplementedError(
            "The remaining encoders are not implemented yet")
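# A minimal sketch of how assign_cat_scaler is presumably driven: cat_info is
# expected on the instance before the call. Only the "method"/"cols" keys come
# from the code above; the owning class and column names are hypothetical:
#
#   prep = Preprocessor()
#   prep.cat_info = {"method": "OneHotEncoder", "cols": ["city", "gender"]}
#   prep.assign_cat_scaler()
#   encoded = prep.cat_encoder.fit_transform(train_df)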
def fit(self, data):
    '''
    Fits the categorical encoder, coerces the input to a pandas DataFrame,
    and saves the input and feature names.

    :param data: a pandas DataFrame, or list
    :return: nothing; the fitted encoder is saved as `self.encoder`
    '''
    from category_encoders import OneHotEncoder

    ohe = OneHotEncoder(return_df=self.return_df,
                        handle_unknown=self.handle_unknown)
    x = self.replace_infrequent_df(data)
    self.input_names = x.columns
    ohe.fit(x)
    self.encoder = ohe
    self.feature_names_from_cat_encoder()
def one_hot_encoded_result(df_orig):
    df = df_orig.copy(deep=True)
    one_hot_enc = OneHotEncoder(cols=['ordinal_result'], use_cat_names=True)
    one_hot_cols = one_hot_enc.fit_transform(df[['ordinal_result']])

    # strip the last two characters from each generated column name
    new_one_hot_col_names = [col[:-2] for col in one_hot_cols.columns]
    mapping_dict = {
        old: new
        for old, new in zip(one_hot_cols.columns, new_one_hot_col_names)
    }
    one_hot_cols.rename(columns=mapping_dict, inplace=True)
    one_hot_cols = one_hot_cols[sorted(one_hot_cols.columns)]

    df_with_new_cols = pd.concat([df, one_hot_cols], axis=1)
    return df_with_new_cols
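# Usage sketch for one_hot_encoded_result, assuming only that the input has an
# 'ordinal_result' column; the sample values (with two-character suffixes that
# the [:-2] slice above appears designed to strip) are hypothetical:
#
#   df = pd.DataFrame({'ordinal_result': ['win_h', 'loss_a', 'draw_h']})
#   out = one_hot_encoded_result(df)
#   # out: 'ordinal_result' plus sorted one-hot columns such as
#   # 'ordinal_result_draw', 'ordinal_result_loss', 'ordinal_result_win'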
def one_hot_encode(self, data):
    """
    Vectorizes multiple categorical variables and stores the fitted
    transformation rules. Returns the vectorized dataset; the fitted
    encoder is kept on `self.model`.

    :param data: dataset used for training (the `data` attribute of a Dataset object)
    """
    # self.enc_dict = {}
    # fixed typo: was handle_unknown="inpute"; older category_encoders
    # releases accept "impute" (newer ones use "value")
    oe = OneHotEncoder(cols=self.columns, handle_unknown="impute")
    oe_data = oe.fit_transform(data)
    self.model = oe
    # oe_data = oe_data.ix[:, org_order]
    return oe_data
def _encode_categories(self):
    """
    This private method encodes categorical variables: label encoding for
    ordinal categories and one-hot encoding for nominal categories.
    """
    logging.info(f'#{self._index()} - Encoding categorical columns...')

    # get column names for categorical and numerical features
    categorical_vars = self.X.select_dtypes(include='object').columns
    numerical_vars = self.X.columns.difference(categorical_vars)
    ordinal = pd.Index([
        'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC',
        'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC'
    ])
    nominal = categorical_vars.difference(ordinal)

    standard_mapping = {'NA': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
    mapping_for_ordinals = [{
        'col': column,
        'mapping': standard_mapping
    } for column in ordinal]

    x_num = self.X[numerical_vars]
    x_test_num = self.X_test[numerical_vars]

    # one-hot encode nominal columns, ordinal-encode ordinal columns
    one_hot_encoder = OneHotEncoder(use_cat_names=True)
    label_encoder = OrdinalEncoder(drop_invariant=True,
                                   mapping=mapping_for_ordinals,
                                   handle_unknown='error')
    x_cat_nom = one_hot_encoder.fit_transform(self.X[nominal])
    x_cat_ord = label_encoder.fit_transform(self.X[ordinal])
    x_test_cat_nom = one_hot_encoder.transform(self.X_test[nominal])
    x_test_cat_ord = label_encoder.transform(self.X_test[ordinal])

    self.X = x_num.join(x_cat_ord).join(x_cat_nom)
    self.X_test = x_test_num.join(x_test_cat_ord).join(x_test_cat_nom)
    logging.info(f'#{self._step_index} - DONE!')
def predict(user_input: Dict):
    """Trains a logistic-regression pipeline on the Kickstarter data and
    predicts success for the given user input."""
    user_input1 = create_df(user_input)

    train, test = train_test_split(df,
                                   train_size=0.80,
                                   test_size=0.20,
                                   stratify=df['project_success'],
                                   random_state=42)

    # select our target
    target = 'project_success'
    # make train without our target or id
    train_features = train.drop(columns=[target])
    # make numeric features
    numeric_features = train_features.select_dtypes(
        include='number').columns.tolist()
    # make a cardinality feature to help filter
    cardinality = train_features.select_dtypes(exclude='number').nunique()
    # get a list of relevant categorical data
    categorical_features = cardinality[cardinality <= 50].index.tolist()
    # combine the lists
    features = numeric_features + categorical_features

    X_train = train[features]
    y_train = train[target]
    X_test = test[features]
    y_test = test[target]

    lrmodel = Pipeline([
        ('ohe', OneHotEncoder(use_cat_names=True)),
        ('scaler', StandardScaler()),
        ('impute', SimpleImputer()),
        ('classifier', LogisticRegressionCV())
    ])
    lrmodel.fit(X_train, y_train)

    if lrmodel.predict(user_input1) == 1:
        user_input.update(
            {'predict': 'Your Kickstarter project is likely to succeed!'})
    else:
        user_input.update(
            {'predict': 'Your Kickstarter project is likely to fail.'})
    return user_input
def fit(self, X, y=None):
    self._dim = X.shape[1]
    if self.cols is None:
        self.cols = get_obj_cols(X)

    self.dummy_encoder = OneHotEncoder(cols=self.cols,
                                       handle_unknown='value',
                                       handle_missing='value')
    self.dummy_encoder = self.dummy_encoder.fit(X)
    self.mapping = self.generate_mapping(X, y)

    X_temp = self.transform(X, override_return_df=True)
    self.feature_names = list(X_temp.columns)
    return self
def preproc_data(data, features):
    '''
    Simple data preprocessing:
      * label-encode the target
      * one-hot encode cat_features
      * fill NaNs with the median
      * split data into X, y

    data: pd.DataFrame()
    features: dict mapping column name -> role (e.g. 'target', 'categorical')
    '''
    # label-encode target
    for i in features.items():
        if 'target' in i:
            target_col = i[0]
    data[target_col] = data[target_col].astype('category').cat.codes
    y = data[target_col]
    X = data.drop([target_col], axis=1)

    cat_features = []
    for feature in features.items():
        if ('categorical' in feature) and (X[feature[0]].nunique(dropna=False) > 2):
            cat_features.append(feature[0])

    # label-encode binary features
    for feature in X.columns:
        if X[feature].nunique(dropna=False) < 3:
            X[feature] = X[feature].astype('category').cat.codes
            if len(cat_features) > 0 and feature in cat_features:
                cat_features.remove(feature)

    # one-hot encoding
    if len(cat_features) > 0:
        encoder = OneHotEncoder(cols=cat_features, drop_invariant=True)
        X = encoder.fit_transform(X)

    # NaNs: flag missing values, then fill with the median
    nan_columns = list(X.columns[X.isnull().sum() > 0])
    if nan_columns:
        for nan_column in nan_columns:
            X[nan_column + 'isNAN'] = pd.isna(X[nan_column]).astype('uint8')
        # assign back instead of calling fillna(inplace=True) on a slice,
        # which silently modifies a temporary copy
        X[nan_columns] = X[nan_columns].fillna(X[nan_columns].median())

    return (X, y)
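# Usage sketch for preproc_data. Because the function membership-tests each
# (key, value) pair from features.items(), `features` is read as a mapping of
# column name -> role string; the names below are hypothetical:
#
#   features = {'churned': 'target', 'plan': 'categorical',
#               'region': 'categorical', 'age': 'numeric'}
#   X, y = preproc_data(raw_df, features)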
def vartypes(self, df, cat_cols):
    """
    Checks which variables in df are categorical and fits a One Hot Encoder
    for each.
    """
    for x in df.columns:
        if is_categorical(df[x]) or x in cat_cols:
            self.categorical_var_list.append(x)
            self.ohencoders_dict[x] = OneHotEncoder().fit(df[x])
def train_test_split(dataset, categorical_cols, train_fraction):
    """
    Splits the dataset into a train and a test set.

    :param dataset: data to be split
    :param categorical_cols: list of the column names of the categorical
        columns (previously identified automatically)
    :param train_fraction: portion of dataset to be used as train set
    :return: a list [train set, one-hot-encoded train set, test set,
        one-hot-encoded test set]
    """
    dataset_encoded = OneHotEncoder(cols=categorical_cols,
                                    use_cat_names=True).fit_transform(dataset)

    train_len = int(len(dataset.index) * train_fraction)
    train_set = dataset.sample(n=train_len, random_state=1)
    train_set_encoded = dataset_encoded.loc[train_set.index].reset_index(
        drop=True)
    test_set = dataset.drop(train_set.index).reset_index(drop=True)
    test_set_encoded = dataset_encoded.drop(
        train_set.index).reset_index(drop=True)

    return train_set.reset_index(
        drop=True), train_set_encoded, test_set, test_set_encoded
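# Usage sketch (column names hypothetical): both the raw and the one-hot
# encoded splits come back, row-aligned, from a single call:
#
#   train, train_enc, test, test_enc = train_test_split(
#       data, categorical_cols=['color', 'size'], train_fraction=0.8)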
def OneHot_Encoding(self, handle_missing='indicator', handle_unknown='indicator'):
    """
    One-hot encoding: converts a categorical feature with n_categories
    possible values into n_categories binary features, exactly one of which
    is 1 and all others 0.

    :param handle_missing: default 'value' encodes missing values as all
        zeros; 'indicator' adds an extra column for missing values
    :param handle_unknown: default 'value' encodes unknown values as all
        zeros; 'indicator' adds an extra column for unknown values
    :return:
    """
    self.encoder = OneHotEncoder(cols=self.cols,
                                 handle_missing=handle_missing,
                                 handle_unknown=handle_unknown)
def train_test_split(dataset, train_fraction):
    """
    Splits the dataset into a train and a test set. Categorical columns are
    detected automatically: any column containing a string value is treated
    as categorical, except the target column "CLASS".

    :param dataset: data to be split
    :param train_fraction: portion of dataset to be used as train set
    :return: a list [train set, one-hot-encoded train set, test set,
        one-hot-encoded test set]
    """
    # Default - treat string-valued columns as categorical, except the target
    categorical_cols = []
    for columnName, columnData in dataset.items():  # iteritems() was removed in pandas 2.0
        for value in columnData.values:
            if type(value) is str:
                categorical_cols.append(columnName)
                break
    if categorical_cols and categorical_cols[-1] == "CLASS":
        categorical_cols = categorical_cols[:-1]  # AR: improve categorical selection

    dataset_encoded = OneHotEncoder(cols=categorical_cols,
                                    use_cat_names=True).fit_transform(dataset)

    if train_fraction == 1:
        return dataset_encoded, dataset_encoded, dataset_encoded, dataset_encoded

    train_len = int(len(dataset.index) * train_fraction)
    train_set = dataset.sample(n=train_len, random_state=1)
    train_set_encoded = dataset_encoded.loc[train_set.index].reset_index(
        drop=True)
    test_set = dataset.drop(train_set.index).reset_index(drop=True)
    test_set_encoded = dataset_encoded.drop(
        train_set.index).reset_index(drop=True)

    return train_set.reset_index(
        drop=True), train_set_encoded, test_set, test_set_encoded
def fit_onehot(input_df: pd.DataFrame, cols: List[str], na_value: Any = None):
    """
    Creates the one-hot encoder by fitting it on the given DataFrame.

    NaN values, and the special value specified under `na_value`, will be
    encoded as an unseen value.

    Args:
        input_df: DataFrame used to fit the encoder
        cols: list of categorical columns to be encoded
        na_value: default null value for the DataFrame

    Returns:
        result_df: encoded input_df DataFrame
        model: encoder model to be passed to the `transform_onehot` method
    """
    df = input_df.copy()
    if na_value is not None:
        for col in cols:
            df[col] = df[col].replace({na_value: np.nan})

    drop_cols = ["{}_nan".format(col) for col in cols]
    encoder = OneHotEncoder(cols=cols, use_cat_names=True)
    encoder = encoder.fit(df)
    result_df = encoder.transform(df)
    for drop_col in drop_cols:
        if drop_col in result_df.columns:
            result_df = result_df.drop(columns=[drop_col])

    model = {
        "encoder": encoder,
        "cols": cols,
        "na_value": na_value,
        "drop_cols": drop_cols,
    }
    return result_df, model
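# The docstring above refers to a `transform_onehot` counterpart that is not
# shown here; a minimal sketch of what it could look like, reusing the stored
# encoder, na_value, and drop_cols (this is an assumption, not the original
# implementation):
def transform_onehot(input_df: pd.DataFrame, model: dict) -> pd.DataFrame:
    df = input_df.copy()
    # apply the same null-value normalization used at fit time
    if model["na_value"] is not None:
        for col in model["cols"]:
            df[col] = df[col].replace({model["na_value"]: np.nan})
    result_df = model["encoder"].transform(df)
    # drop the same NaN-indicator columns that fit_onehot removed
    return result_df.drop(columns=[c for c in model["drop_cols"]
                                   if c in result_df.columns])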
def encode_low_cardinality_categorical_df(dataframe, fit=False):
    """
    Encode low-cardinality categorical features using one-hot encoding,
    dropping invariant features.
    ---
    Arguments
        dataframe: pd.DataFrame
            Dataframe with pre-processed data (i.e. renamed features),
            low-cardinality categorical features only
        fit: boolean
            Indicates if we should train or load an encoder
    Returns
        dataframe: pd.DataFrame
            Dataframe with encoded data
    """
    # train or load an encoder
    if fit:
        encoder = OneHotEncoder(cols=dataframe.columns.values,
                                drop_invariant=True)
        encoder.fit(dataframe)
        pickle_obj(encoder, 'low_card_categorical_encoder')
    else:
        encoder = unpickle_obj('low_card_categorical_encoder')

    # transform data
    return encoder.transform(dataframe)
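# Round-trip sketch: fit once on training data, then reuse the pickled
# encoder at prediction time (pickle_obj/unpickle_obj are the persistence
# helpers the function above relies on; DataFrame names are hypothetical):
#
#   train_enc = encode_low_cardinality_categorical_df(train_df, fit=True)
#   test_enc = encode_low_cardinality_categorical_df(test_df)  # loads the saved encoder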
def CatEncoder(X, cat_cols, tags, estimator_name, objective_type, trial,
               n_classes, random_state):
    enc_pipe = None  # defined up front so the function can return it even when no encoding is needed
    if not tags["handles categorical"]:
        large_threshold = 6
        # TODO: handle numpy arrays with categorical?
        # TODO: handle multiclass / regression
        if isinstance(X, pd.DataFrame) and isinstance(cat_cols[0], str):
            large_cardinal_cats = [col for col in X[cat_cols].columns
                                   if X[col].nunique() > large_threshold]
            small_cardinal_cats = [col for col in X[cat_cols].columns
                                   if X[col].nunique() <= large_threshold]
        elif isinstance(X, pd.DataFrame):
            large_cardinal_cats = [col for col in cat_cols
                                   if len(np.unique(X.iloc[:, col])) > large_threshold]
            small_cardinal_cats = [col for col in cat_cols
                                   if len(np.unique(X.iloc[:, col])) <= large_threshold]
        else:
            large_cardinal_cats = [col for col in cat_cols
                                   if len(np.unique(X[:, col])) > large_threshold]
            small_cardinal_cats = [col for col in cat_cols
                                   if len(np.unique(X[:, col])) <= large_threshold]

        cat_enc_types = ["target", "binary", "catboost"]
        if len(small_cardinal_cats) > 0:
            enc_pipe = add_to_pipe(
                enc_pipe, "ohe",
                OneHotEncoder(cols=small_cardinal_cats, drop_invariant=True))
        if len(large_cardinal_cats) > 0:
            if objective_type == "classification" and n_classes == 1:
                cat_enc_types.append("woe")
            cat_enc_type = trial.suggest_categorical(
                estimator_name + " cat_enc_type", cat_enc_types)
            if cat_enc_type == "binary":
                # mapping = get_mapping(X, large_cardinal_cats)
                enc = BinaryEncoder(cols=large_cardinal_cats,
                                    # mapping=mapping
                                    )
            elif cat_enc_type == "woe":
                enc = WOEEncoder(cols=large_cardinal_cats, drop_invariant=True)
            elif cat_enc_type == "target":
                min_samples_leaf = 6  # TODO: calculate percentage or something else
                enc = TargetEncoder(min_samples_leaf=min_samples_leaf,
                                    cols=large_cardinal_cats)
            else:  # catboost
                enc = CatBoostEncoder(cols=large_cardinal_cats,
                                      random_state=random_state)  # TODO: replace SEED
                # TODO: permute the dataset beforehand
            enc_pipe = add_to_pipe(enc_pipe, cat_enc_type + "_encoder", enc)
    return enc_pipe
def one_hot_encode(columns: Union[List[str], str]) -> CategoryEncoder:
    """Performs simple one-hot encoding.

    An alias to stl.category_encode(OneHotEncoder(), columns).

    Args:
        columns: list of columns to be encoded. Treats a string as a list
            of length 1.

    Returns:
        A feature constructor returning a concatenation of the one-hot
        encoding of each column.

    Examples:
        >>> stl.one_hot_encode(['Sex', 'Embarked'])
        >>> stl.one_hot_encode('Embarked')
    """
    enc = OneHotEncoder()
    return category_encode(enc, columns=columns, targets=None)
def CatEncoder(X, cat_cols, tags, objective_type, trial, n_classes,
               random_state):
    enc_pipe = None  # defined up front so the function can return it even when no encoding is needed
    if not tags["handles categorical"]:
        large_threshold = 6
        # TODO: handle numpy arrays with categorical?
        large_cardinal_cats = [col for col in X[cat_cols].columns
                               if X[col].nunique() > large_threshold]
        small_cardinal_cats = [col for col in X[cat_cols].columns
                               if X[col].nunique() <= large_threshold]

        cat_enc_types = ["binary", "catboost", "woe", "target"]
        # the comprehensions above always return lists (never None), so test
        # for emptiness rather than `is not None`
        if small_cardinal_cats:
            enc_pipe = add_to_pipe(
                enc_pipe, "ohe",
                OneHotEncoder(cols=small_cardinal_cats, drop_invariant=True))
        if large_cardinal_cats:
            if objective_type == "classification" and n_classes > 2:  # multiclass
                cat_enc_types = ["binary"]
            cat_enc_type = trial.suggest_categorical("cat_enc_type",
                                                     cat_enc_types)
            if cat_enc_type == "binary":
                # mapping = get_mapping(X, large_cardinal_cats)
                enc = BinaryEncoder(cols=large_cardinal_cats,
                                    drop_invariant=True,
                                    # mapping=mapping
                                    )
            elif cat_enc_type == "woe":
                enc = WOEEncoder(cols=large_cardinal_cats, drop_invariant=True)
            elif cat_enc_type == "target":
                min_samples_leaf = 10  # TODO: calculate percentage or something else
                enc = TargetEncoder(min_samples_leaf=min_samples_leaf,
                                    cols=large_cardinal_cats,
                                    drop_invariant=True)
            else:  # catboost
                enc = CatBoostEncoder(cols=large_cardinal_cats,
                                      drop_invariant=True,
                                      random_state=random_state)  # TODO: replace SEED
                # TODO: permute the dataset beforehand
            enc_pipe = add_to_pipe(enc_pipe, cat_enc_type + "_encoder", enc)
    return enc_pipe
def fit_model(model, df, target, numeric_columns, categorical_columns,
              param_grid):
    x_train, x_test, y_train, y_test = train_test_split(df,
                                                        target,
                                                        test_size=0.10,
                                                        random_state=42)

    imputer = IterativeImputer(max_iter=30, random_state=42)
    scaler = MinMaxScaler()
    frequent = SimpleImputer(missing_values=np.NaN, strategy='most_frequent')
    onehot = OneHotEncoder()
    pca = PCA(n_components=round(x_train.shape[1] * 0.8))

    preprocess = make_column_transformer(
        (make_pipeline(imputer, scaler), numeric_columns),
        (make_pipeline(frequent, onehot), categorical_columns))

    pipe = make_pipeline(preprocess, pca,
                         GridSearchCV(model, param_grid=param_grid, verbose=10))
    return pipe.fit(x_train, y_train)
def generate_model(X, y, prefix, param):
    '''
    Runs an MLP with softmax output activation, which allows multiclass
    classification. We will need to test all-vs-one to check performance.
    '''
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.30,
                                                        random_state=40)
    print(X_train.shape)
    print(X_test.shape)

    ppl_model1 = Pipeline(steps=[
        ('one-hot encoder', OneHotEncoder()),
        ('mlp', MLPClassifier(hidden_layer_sizes=(5,),
                              activation='relu',
                              solver='lbfgs',
                              alpha=1e-5,
                              max_iter=500,
                              random_state=1,
                              verbose=True))
    ])
    # note: this sets the attribute on the Pipeline, not the MLP step, and
    # MLPClassifier chooses its output activation (softmax for multiclass)
    # during fit anyway
    ppl_model1.out_activation_ = 'softmax'
    ppl_model1.fit(X_train, y_train)

    predict_train = ppl_model1.predict(X_train)
    predict_test = ppl_model1.predict(X_test)
    print(confusion_matrix(y_train, predict_train))
    print(classification_report(y_train, predict_train))
    print(confusion_matrix(y_test, predict_test))
    print(classification_report(y_test, predict_test))

    # print("weights between input and first hidden layer:")
    # print(ppl_model1[1].coefs_[0])
    # print("\nweights between first hidden and second hidden layer:")
    # print(ppl_model1[1].coefs_[1])
    # print("Bias values for first hidden layer:")
    # print(ppl_model1[1].intercepts_[0])
    # print("\nBias values for second hidden layer:")
    # print(ppl_model1[1].intercepts_[1])

    # sklite_file = "draft/mlp_sweep_model.json"
    # lazy = LazyExport(ppl_model1)
    # note: this saves only the fitted MLP step, not the encoder
    nameout = 'output/' + prefix + '_model' + param.curr_datetime + '.joblib'
    dump(ppl_model1[1], nameout)
def categoricals(self, model_name='onehot_model.pkl', cols=None, owr=False,
                 model_bin=None):
    """One-hot encoder on categoricals."""
    self.log('Apply onehot encoder on categorical')
    model_path = os.path.join(self.model_path, model_name)
    if cols is None:
        cols = self.data.cat_cols

    if ((not os.path.isfile(model_path)) or owr) and (model_bin is None):
        self.log('\nTrain model\n')
        # impute_missing is accepted by older category_encoders releases
        # only; newer versions use handle_missing instead
        model_bin = OneHotEncoder(cols=cols,
                                  use_cat_names=True,
                                  handle_unknown='error',
                                  drop_invariant=False,
                                  impute_missing=False)
        model_bin.fit(self.data._X)
        self.data._X = model_bin.transform(self.data._X)
        setattr(model_bin, 'data_schema', self.data._X.columns.values)
        # save model
        if self.auto_save:
            joblib.dump(model_bin, model_path)
    elif os.path.isfile(model_path):
        # file exists / prediction:
        model_bin = joblib.load(model_path)
        self.data._X = model_bin.transform(self.data._X)
        self.data.check_schema(model_bin, '_X')
    else:
        # prediction in pipeline
        self.data._X = model_bin.transform(self.data._X)
        self.data.check_schema(model_bin, '_X')
    return model_bin
def fit_model(model, df, target, numeric_columns, categorical_columns,
              param_grid):
    x_train, x_test, y_train, y_test = train_test_split(df,
                                                        target,
                                                        test_size=0.10,
                                                        random_state=42)

    imputer = IterativeImputer(max_iter=30, random_state=42)
    scaler = MinMaxScaler()
    frequent = SimpleImputer(missing_values=np.NaN, strategy='most_frequent')
    onehot = OneHotEncoder()
    # smt = SMOTETomek('auto')
    # over_samp = SMOTE(sampling_strategy={0: count_class_0})
    # under_samp = NearMiss(sampling_strategy={1: count_class_1})

    preprocess = make_column_transformer(
        (make_pipeline(imputer, scaler), numeric_columns),
        (make_pipeline(frequent, onehot), categorical_columns))

    pipe = make_pipeline(preprocess,
                         GridSearchCV(model, param_grid=param_grid, verbose=10))
    return pipe.fit(x_train, y_train)
def defineBestModelPipeline(df, target, categorical_columns, numeric_columns):
    # Splitting original data into train and test
    x_train, x_test, y_train, y_test = train_test_split(df,
                                                        target,
                                                        test_size=0.1,
                                                        random_state=42)
    y_train = y_train.to_numpy()  # transforming training targets into numpy arrays
    y_test = y_test.to_numpy()  # transforming test targets into numpy arrays

    # If desired, we can balance training classes using one of the functions below

    # Obtaining balanced data for modeling using Random Under Sampling
    x_train, y_train = balancingClassesRus(x_train, y_train)
    # Obtaining balanced data for modeling using SMOTEENN
    # x_train, y_train = balancingClassesSmoteenn(x_train, y_train)
    # Obtaining balanced data for modeling using SMOTE
    # x_train, y_train = balancingClassesSmote(x_train, y_train)

    # 1st -> Numeric transformers.
    # Here, we create several different data transformation pipelines to be
    # applied to our numeric features.
    numeric_transformer_1 = Pipeline(
        steps=[('imp', IterativeImputer(max_iter=30, random_state=42)),
               ('scaler', MinMaxScaler())])
    numeric_transformer_2 = Pipeline(
        steps=[('imp', IterativeImputer(max_iter=20, random_state=42)),
               ('scaler', StandardScaler())])
    numeric_transformer_3 = Pipeline(
        steps=[('imp', SimpleImputer(strategy='mean')),
               ('scaler', MinMaxScaler())])
    numeric_transformer_4 = Pipeline(
        steps=[('imp', SimpleImputer(strategy='median')),
               ('scaler', StandardScaler())])

    # 2nd -> Categorical transformer.
    # Despite my option of not doing it, you can also create different data
    # transformation pipelines for your categorical features.
    categorical_transformer = Pipeline(
        steps=[('frequent', SimpleImputer(strategy='most_frequent')),
               ('onehot', OneHotEncoder(use_cat_names=True))])

    # 3rd -> Combining both numerical and categorical pipelines.
    # Here, we create different ColumnTransformers, each one with a different
    # numerical transformation.
    data_transformations_1 = ColumnTransformer(
        transformers=[('num', numeric_transformer_1, numeric_columns),
                      ('cat', categorical_transformer, categorical_columns)])
    data_transformations_2 = ColumnTransformer(
        transformers=[('num', numeric_transformer_2, numeric_columns),
                      ('cat', categorical_transformer, categorical_columns)])
    data_transformations_3 = ColumnTransformer(
        transformers=[('num', numeric_transformer_3, numeric_columns),
                      ('cat', categorical_transformer, categorical_columns)])
    data_transformations_4 = ColumnTransformer(
        transformers=[('num', numeric_transformer_4, numeric_columns),
                      ('cat', categorical_transformer, categorical_columns)])

    # And finally, we apply these different data transformations inside
    # RandomizedSearchCV, trying to find the best imputing strategy, the best
    # feature engineering strategy, and the best model with its respective
    # parameters. Below, we initialize a Pipeline object with any
    # transformation we want on each of the steps.
    pipe = Pipeline(steps=[
        # data transformation step, initialized with any of the options above
        ('data_transformations', data_transformations_1),
        # feature engineering step, initialized with any desired method
        ('feature_eng', PCA()),
        # modeling step, initialized with any model object
        ('clf', SVC())
    ])  # memory='cache_folder' -> used to optimize memory when needed

    # Now, we define the grid of parameters that RandomizedSearchCV will use.
    # It randomly chooses options for each step inside the dictionaries
    # ('data_transformations', 'feature_eng', 'clf' and the clf parameters).
    # At the end of its iterations, RandomizedSearchCV returns the best options.

    # The transformation and feature-engineering options are shared by every
    # entry of the grid, so they are factored out here.
    all_transformations = [
        data_transformations_1, data_transformations_2,
        data_transformations_3, data_transformations_4
    ]
    feature_eng_options = [
        None,
        PCA(n_components=round(x_train.shape[1] * 0.9)),
        PCA(n_components=round(x_train.shape[1] * 0.8)),
        PCA(n_components=round(x_train.shape[1] * 0.7)),
        PolynomialFeatures(degree=1),
        PolynomialFeatures(degree=2),
        PolynomialFeatures(degree=3)
    ]

    # Note: a frozen scipy distribution placed *inside a list* is passed to
    # the estimator as-is instead of being sampled, so the max_depth options
    # are enumerated explicitly instead of using stats.randint inside a list.
    params_grid = [{
        'data_transformations': all_transformations,
        'feature_eng': feature_eng_options,
        'clf': [KNeighborsClassifier()],
        'clf__n_neighbors': stats.randint(1, 30),
        'clf__metric': ['minkowski', 'euclidean']
    }, {
        'data_transformations': all_transformations,
        'feature_eng': feature_eng_options,
        'clf': [LogisticRegression()],
        'clf__penalty': ['l1', 'l2'],
        'clf__C': stats.uniform(0.01, 10)
    }, {
        'data_transformations': all_transformations,
        'feature_eng': feature_eng_options,
        'clf': [SVC()],
        'clf__C': stats.uniform(0.01, 1),
        'clf__gamma': stats.uniform(0.01, 1)
    }, {
        'data_transformations': all_transformations,
        'feature_eng': feature_eng_options,
        'clf': [DecisionTreeClassifier()],
        'clf__criterion': ['gini', 'entropy'],
        'clf__max_features': [None, "auto", "log2"],
        'clf__max_depth': [None, 1, 2, 3, 4]
    }, {
        'data_transformations': all_transformations,
        'feature_eng': feature_eng_options,
        'clf': [RandomForestClassifier()],
        'clf__n_estimators': stats.randint(10, 175),
        'clf__max_features': [None, "auto", "log2"],
        'clf__max_depth': [None, 1, 2, 3, 4],
        'clf__random_state': stats.randint(1, 49)
    }, {
        'data_transformations': all_transformations,
        'feature_eng': feature_eng_options,
        'clf': [ExtraTreesClassifier()],
        'clf__n_estimators': stats.randint(10, 150),
        'clf__max_features': [None, "auto", "log2"],
        'clf__max_depth': [None, 1, 2, 3, 4, 5]
    }, {
        'data_transformations': all_transformations,
        'feature_eng': feature_eng_options,
        'clf': [GradientBoostingClassifier()],
        'clf__n_estimators': stats.randint(10, 100),
        'clf__learning_rate': stats.uniform(0.01, 0.7),
        'clf__max_depth': [None, 1, 2, 3, 4, 5]
    }, {
        'data_transformations': all_transformations,
        'feature_eng': feature_eng_options,
        'clf': [LGBMClassifier()],
        'clf__n_estimators': stats.randint(1, 100),
        'clf__learning_rate': stats.uniform(0.01, 0.7),
        'clf__max_depth': [None, 1, 2, 3, 4, 5]
    }, {
        'data_transformations': all_transformations,
        'feature_eng': feature_eng_options,
        'clf': [XGBClassifier()],
        'clf__n_estimators': stats.randint(5, 125),
        'clf__eta': stats.uniform(0.01, 1),
        'clf__max_depth': [None, 1, 2, 3, 4, 5],
        'clf__gamma': stats.uniform(0.01, 1)
    }, {
        'data_transformations': all_transformations,
        'feature_eng': feature_eng_options,
        'clf': [
            StackingClassifier(estimators=[
                ('svc', SVC(C=1, gamma=1)),
                ('rf', RandomForestClassifier(max_depth=7,
                                              max_features=None,
                                              n_estimators=60,
                                              n_jobs=-1,
                                              random_state=42)),
                ('xgb', XGBClassifier(eta=0.6, gamma=0.7, max_depth=None,
                                      n_estimators=30))
            ], final_estimator=LogisticRegression(C=1))
        ]
    }, {
        'data_transformations': all_transformations,
        'feature_eng': feature_eng_options,
        'clf': [
            VotingClassifier(estimators=[
                ('gbt', GradientBoostingClassifier(learning_rate=0.8,
                                                   max_depth=None,
                                                   n_estimators=30)),
                ('lgbm', LGBMClassifier(n_estimators=30,
                                        learning_rate=0.6,
                                        max_depth=None)),
                ('xgb', XGBClassifier(eta=0.8, gamma=0.8, max_depth=None,
                                      n_estimators=40))
            ], voting='soft')
        ]
    }]

    # Now, we fit a RandomizedSearchCV to search over the grid of parameters
    # defined above
    metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
    best_model_pipeline = RandomizedSearchCV(pipe,
                                             params_grid,
                                             n_iter=500,
                                             scoring=metrics,
                                             refit='accuracy',
                                             n_jobs=-1,
                                             cv=5,
                                             random_state=42)
    best_model_pipeline.fit(x_train, y_train)

    # At last, we check the final results
    print("\n\n#---------------- Best Data Pipeline found in RandomizedSearchCV ----------------#\n\n",
          best_model_pipeline.best_estimator_[0])
    print("\n\n#---------------- Best Feature Engineering technique found in RandomizedSearchCV ----------------#\n\n",
          best_model_pipeline.best_estimator_[1])
    print("\n\n#---------------- Best Classifier found in RandomizedSearchCV ----------------#\n\n",
          best_model_pipeline.best_estimator_[2])
    print("\n\n#---------------- Best Estimator's average Accuracy Score on CV (validation set) ----------------#\n\n",
          best_model_pipeline.best_score_)

    return x_train, x_test, y_train, y_test, best_model_pipeline
                     'GuaranteeGroup', 'FamilySizeBin', 'IsBoy', 'IsFemale',
                     'FareLow', 'DataPartition', 'PassengerId', 'Survived']]

###########################################################################
#                    Split data into train and test                      #
###########################################################################
trainData = fullData.loc[fullData.DataPartition == 'train']
testData = fullData.loc[fullData.DataPartition == 'test']

###########################################################################
#                            One hot encode                              #
###########################################################################
# https://github.com/scikit-learn-contrib/categorical-encoding
# http://contrib.scikit-learn.org/categorical-encoding/onehot.html
from category_encoders import OneHotEncoder

categories = list(set(trainData.select_dtypes(['category']).columns))
target = trainData.Survived

enc = OneHotEncoder(cols=categories, return_df=1,
                    handle_unknown='ignore').fit(trainData, target)
trainData = enc.transform(trainData)
testData = enc.transform(testData)

###########################################################################
#        Drop multi-collinear levels and no-longer-required columns      #
###########################################################################
dropColumns = ['DataPartition']
trainData = trainData.drop(columns=dropColumns)
testData = testData.drop(columns=dropColumns)
testData = testData.drop(columns='Survived')

###########################################################################
#                           Start h2o cloud                              #
###########################################################################
import h2o

h2o.init()
h2o.remove_all()  # clean slate, in case cluster was already running (was missing parentheses)
def train_pipeline(X, y):
    """
    Builds and trains a machine learning pipeline
    """
    numerical_col = [
        'Num nights', 'Adults', 'Children', 'Session duration', 'Sessions',
        'Avg. session length (sec)', 'Avg. pageviews per session',
        'Pageviews', 'Hits', 'Created to arrival'
    ]
    categorical_col = [
        'Language', 'Website', 'Enquiry type', 'Enquiry status',
        'Client budget', 'Country code', 'GA source', 'GA medium', 'Device',
        'Created month'
    ]
    binary_col = [
        'Flights booked', 'User agent', 'User repeat', 'User referral'
    ]
    text_col = ['Click path', 'GA keyword']
    target = ['is booking']

    # numerical pipeline
    numerical_pipeline = make_pipeline(ColumnSelector(cols=numerical_col),
                                       SimpleImputer(strategy="median"),
                                       StandardScaler())
    # categorical pipeline
    categorical_pipeline = make_pipeline(
        ColumnSelector(cols=categorical_col),
        SimpleImputer(strategy="constant", fill_value='None'),
        OneHotEncoder())
    # binary pipeline
    binary_pipeline = make_pipeline(ColumnSelector(cols=binary_col),
                                    SimpleImputer(strategy="most_frequent"),
                                    BinaryEncoder())
    # text pipelines
    text_pipeline_1 = make_pipeline(
        ColumnSelector(cols=['Click path']),
        SimpleImputer(strategy='constant', fill_value=''),
        ReshapeTransformer(), HashingVectorizer(n_features=2**11),
        DenseTransformer())
    text_pipeline_2 = make_pipeline(
        ColumnSelector(cols=['GA keyword']),
        SimpleImputer(strategy='constant', fill_value=''),
        ReshapeTransformer(), TfidfVectorizer(), DenseTransformer())

    # pipeline union
    processing_pipeline = make_union(numerical_pipeline, categorical_pipeline,
                                     binary_pipeline, text_pipeline_1,
                                     text_pipeline_2)

    estimator = BalancedRandomForestClassifier(bootstrap=False,
                                               class_weight=None,
                                               criterion='gini',
                                               max_depth=60,
                                               max_features='sqrt',
                                               max_leaf_nodes=None,
                                               min_impurity_decrease=0.0,
                                               min_samples_leaf=1,
                                               min_samples_split=5,
                                               min_weight_fraction_leaf=0.0,
                                               n_estimators=472,
                                               n_jobs=1,
                                               oob_score=False,
                                               random_state=None,
                                               replacement=False,
                                               sampling_strategy='auto',
                                               verbose=0,
                                               warm_start=False)

    predictive_pipeline = make_pipeline(processing_pipeline, estimator)
    predictive_pipeline.fit(X, y)
    return predictive_pipeline
st.subheader(f"Breaking Down {y_axis} by: {x_axis}")

if chart_type == 'line':
    st.line_chart(df.groupby(x_axis)[y_axis].mean())
elif chart_type == 'bar':
    st.bar_chart(df.groupby(x_axis)[y_axis].mean())
elif chart_type == 'box':
    chart = sns.catplot(x=x_axis, y=y_axis, kind='box', aspect=2, data=df)
    if df[x_axis].nunique() > 8:
        chart.set_xticklabels(rotation=90)
    st.pyplot(chart)

if page in ['Model Explorer', 'Causal Impact']:
    # note: st.cache is a decorator factory; calling it as a bare statement
    # like this has no caching effect
    st.cache()
    pipe = make_pipeline(OneHotEncoder(use_cat_names=True), xgb.XGBRegressor())
    st.cache()
    X_train, X_val, y_train, y_val = train_test_split(
        df.drop('SalePrice', axis=1),
        df['SalePrice'],
        test_size=0.2,
        random_state=1985)

if page == 'Model Explorer':
    num_rounds = st.sidebar.number_input('Number of Boosting Rounds',
                                         min_value=100,
                                         max_value=5000,
                                         step=100)
    tree_depth = st.sidebar.number_input('Tree Depth',
                                         min_value=2,
                                         max_value=8,
                                         step=1,
                                         value=3)
    learning_rate = st.sidebar.number_input('Learning Rate',
                                            min_value=.001,
                                            max_value=1.0,
                                            step=.05,
                                            value=0.1)
    validation_size = st.sidebar.number_input('Validation Size',
def aggregate_per_time_interval(date_interval):
    ### Importing
    customer_data = pd.read_csv('Data/olist_customers_dataset.csv')
    geolocation_data = pd.read_csv('Data/olist_geolocation_dataset.csv')
    order_items_data = pd.read_csv('Data/olist_order_items_dataset.csv')
    order_payments_data = pd.read_csv('Data/olist_order_payments_dataset.csv')
    order_reviews_data = pd.read_csv('Data/olist_order_reviews_dataset.csv')
    olist_order_data = pd.read_csv('Data/olist_orders_dataset.csv')
    olist_products_data = pd.read_csv('Data/olist_products_dataset.csv')
    olist_sellers_data = pd.read_csv('Data/olist_sellers_dataset.csv')
    olist_product_category_data = pd.read_csv(
        'Data/product_category_name_translation.csv')

    ### Convert column of interest to datetime format
    olist_order_data['order_purchase_timestamp'] = pd.to_datetime(
        olist_order_data['order_purchase_timestamp'])

    ### Keep dates that are between the given date limits
    mask = ((olist_order_data['order_purchase_timestamp'] >= date_interval[0])
            & (olist_order_data['order_purchase_timestamp'] < date_interval[1]))
    olist_order_data = olist_order_data[mask]

    ### Rest of function is the same as in the first notebook of the project

    ### Merge olist_products_dataset to get product category name in English
    olist_products_data = olist_products_data.merge(
        olist_product_category_data, how='left', on='product_category_name')

    ### Merge order items dataset with products dataset
    order_items_data = order_items_data.merge(olist_products_data,
                                              how='left',
                                              on='product_id')

    ### Count number of occurrences for each order ID
    count = order_items_data.groupby('order_id').count().iloc[:, 0].rename(
        'n_items per order')

    ### Numeric data will be aggregated by mean
    num_order_items_data = pd.concat([
        order_items_data['order_id'],
        order_items_data.select_dtypes('float64')
    ], axis=1)
    num_order_items_data = num_order_items_data.groupby('order_id').mean()

    ### Aggregate each order's product category names by most frequent value
    cat_order_items_data = order_items_data[[
        'order_id', 'product_category_name_english'
    ]].groupby('order_id').agg(lambda g: g.value_counts().index[0]
                               if np.any(g.notnull()) else np.nan)

    order_items_data = pd.concat(
        [count, num_order_items_data, cat_order_items_data], axis=1)
    olist_order_data = olist_order_data.merge(order_items_data,
                                              how='left',
                                              on='order_id')

    ### Count number of payments per order
    count = order_payments_data.groupby('order_id').count().iloc[:, 0].rename(
        'n_payments per order')

    ### One-hot encode the payment type feature
    enc = OneHotEncoder(cols=['payment_type'], use_cat_names=True)
    order_payments_data = enc.fit_transform(order_payments_data)
    order_payments_data = order_payments_data.drop('payment_type_not_defined',
                                                   axis=1)
    order_payments_data = order_payments_data.groupby('order_id').mean()
    order_payments_data = pd.concat([order_payments_data, count], axis=1)
    olist_order_data = olist_order_data.merge(order_payments_data,
                                              how='left',
                                              on='order_id')

    ### Number of reviews per order
    count = order_reviews_data.groupby('order_id').count().iloc[:, 0].rename(
        'n_reviews per order').astype('float64')
    order_reviews_data = order_reviews_data[['order_id', 'review_score'
                                             ]].groupby('order_id').mean()
    order_reviews_data = pd.concat([count, order_reviews_data], axis=1)
    olist_order_data = olist_order_data.merge(order_reviews_data,
                                              how='left',
                                              on='order_id')

    ### Merge customer table with order tables
    customer_data = customer_data.merge(olist_order_data,
                                        how='inner',
                                        on='customer_id')

    ### Customer data aggregation
    count = customer_data.groupby(
        'customer_unique_id').count().iloc[:, 0].rename('n_orders per customer')

    ### Numeric features aggregated by mean
    numeric_customer_data = pd.concat([
        customer_data.select_dtypes('float64'),
        customer_data['customer_unique_id']
    ], axis=1)
    numeric_customer_data = numeric_customer_data.groupby(
        'customer_unique_id').mean()

    ### Categorical features aggregated by most frequent value
    cat_customer_data = customer_data[[
        'customer_unique_id', 'product_category_name_english'
    ]].groupby('customer_unique_id').agg(lambda g: g.value_counts().index[0]
                                         if np.any(g.notnull()) else np.nan)

    customer_data = pd.concat(
        [count, numeric_customer_data, cat_customer_data], axis=1)
    return customer_data
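# Usage sketch: the function is presumably called with a (start, end) pair of
# date strings comparable against the parsed timestamps (the exact dates here
# are hypothetical):
#
#   customers_h1_2017 = aggregate_per_time_interval(('2017-01-01', '2017-07-01'))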
def main():
    # Preprocess the data
    # start your code here
    # Load data
    data = pd.read_csv("bank.csv")

    # Fix typo in column name
    data.rename(columns={"subcribed": "subscribed"}, inplace=True)

    # Encode features
    data = data.replace({"yes": 1, "no": 0})
    ohe = OneHotEncoder(
        cols=["job", "marital", "education", "contact", "month", "poutcome"],
        use_cat_names=True,
        return_df=True,
    )
    data = ohe.fit_transform(data)
    # print(data.head())

    # Get features and target
    X = data.drop(columns=["subscribed"])
    y = data["subscribed"]

    # Split training and testing data
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=100)
    # end your code here

    # print("\n\nDecision Tree: -------------------------------------------------------------------------\n\n")
    # # start your code here
    # tree_classifier = DecisionTreeClassifier(
    #     max_depth=4,
    #     max_leaf_nodes=4,
    #     random_state=100,
    # )
    # tree_classifier.fit(X_train, y_train)
    # y_pred_tree = tree_classifier.predict(X_test)
    # evaluate(y_test, y_pred_tree)
    # # feature_imp_tree = pd.Series(
    # #     tree_classifier.feature_importances_, index=X_train.columns
    # # ).sort_values(ascending=False)[:10]
    # # print(feature_imp_tree)
    # # plt.figure(figsize=(20, 10))
    # # plot_tree(
    # #     tree_classifier,
    # #     feature_names=X_train.columns,
    # #     class_names=["no", "yes"],
    # #     rounded=True,
    # # )
    # # plt.savefig("decision_tree.svg", bbox_inches="tight")
    # # plt.show()
    # # end your code here

    # print("\n\nRandom Forest: -------------------------------------------------------------------------\n\n")
    # # start your code here
    # rf_classifier = RandomForestClassifier(
    #     # bootstrap=False,
    #     criterion="entropy",
    #     max_depth=9,
    #     max_leaf_nodes=21,
    #     min_samples_leaf=5,
    #     random_state=100,
    # )
    # rf_classifier.fit(X_train, y_train)
    # y_pred_rf = rf_classifier.predict(X_test)
    # evaluate(y_test, y_pred_rf)
    # feature_imp_rf = pd.Series(
    #     rf_classifier.feature_importances_, index=X_train.columns
    # ).sort_values(ascending=False)[:10]
    # print(feature_imp_rf)
    # # end your code here

    print(
        "\n\nXGBoost: -------------------------------------------------------------------------\n\n"
    )
    # start your code here
    xgb_classifier = xgb.XGBClassifier(
        objective="binary:logistic",
        learning_rate=0.1,
        max_depth=3,
        min_child_weight=5,
        use_label_encoder=False,
        colsample_bytree=0.3,
    )
    xgb_classifier.fit(X_train, y_train)
    y_pred_xgb = xgb_classifier.predict(X_test)
    evaluate(y_test, y_pred_xgb)
import os
print(os.listdir("../input"))

train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
train.head()
train.info()

test_id = test['id']  # save for submission
del train['id']
del test['id']

train['type'].unique(), train['color'].unique()

sns.violinplot(x='bone_length', y='type', data=train)
sns.boxplot(x='hair_length', y='type', data=train)
sns.pairplot(train)

from category_encoders import OneHotEncoder

encoder = OneHotEncoder(cols=['color'], use_cat_names=True)
train = encoder.fit_transform(train)
# transform (not fit_transform) so the test set reuses the categories
# learned from the training set
test = encoder.transform(test)
train.head()

from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
encoder.fit(train['type'])
print(encoder.classes_)
train['type_no'] = encoder.transform(train['type'])
train.head()

sns.heatmap(train.corr(), xticklabels=list(train), yticklabels=list(train))