def _plotScores(scores, paramGrid, to_file, scoreLabel=None, greater_is_better=True, vrange=None, cmap="YlOrRd"):
    '''
    # Params:
        scores: model scores, forwarded unchanged to the plot helper
        paramGrid: iterable of parameter dicts; the keys of the first dict are used for all of them
        to_file: forwarded unchanged to the plot helper
        scoreLabel(default None): forwarded unchanged to the plot helper
        greater_is_better(default True): forwarded to the 2D/3D plot helpers
        vrange(default None): forwarded unchanged to the plot helper
        cmap(default "YlOrRd"): forwarded to the 2D/3D plot helpers

    # Instructions:
        Keep the parameters that take more than one distinct value in the grid and
        dispatch to _plot1DGrid/_plot2DGrid/_plot3DGrid according to how many remain.
        Nothing is plotted (only a log line is written) for 0 or more than 3 varying parameters.
    '''
    # Materialise once: the grid is walked several times and may be a one-shot iterable.
    candidates = list(paramGrid)
    keys = sorted(candidates[0].keys())

    uniqParams = dict()
    for k in keys:
        # Values are compared by their string form so unhashable/unorderable values work;
        # return_index gives the position of the first occurrence of each distinct value.
        first_seen = np.unique([str(params[k]) for params in candidates], return_index=True)[1]
        distinct = [candidates[i][k] for i in sorted(first_seen)]
        if len(distinct) > 1:
            uniqParams[k] = distinct
    keysToPlot = [k for k in keys if k in uniqParams]

    numDim = len(keysToPlot)
    if numDim > 3:
        printlog("Too many dimensions to plot.")
    elif numDim == 3:
        _plot3DGrid(scores, uniqParams, keysToPlot, scoreLabel, greater_is_better, vrange, cmap, to_file)
    elif numDim == 2:
        _plot2DGrid(scores, uniqParams, keysToPlot, scoreLabel, greater_is_better, vrange, cmap, to_file)
    elif numDim == 1:
        _plot1DGrid(scores, uniqParams, scoreLabel, vrange, to_file)
    else:
        printlog("No parameters that vary in the grid")
def varyDataset(ds, save_path):
    '''
    # Params:
        ds: str, path of the dataset csv (read as gb18030, first row as header, first column as index)
        save_path: str, path the extended dataset is written to (gb18030)

    # Instructions:
        For each id/cell feature pair returned by Preprocess.pattern_to_feature for the
        'als_*_id_' and 'als_*_cell_' patterns, insert two columns in front of the id feature:
        the row-wise maximum and minimum of the pair. The new names are the id feature name
        with every 'id' replaced by 'large' / 'small'.
    '''
    windows = ['d7', 'd15', 'm1', 'm3', 'm6', 'm12', 'fst', 'lst']
    classed_feature_preffix = [
        ['^als_{}_id_'.format(window) for window in windows],
        ['^als_{}_cell_'.format(window) for window in windows],
    ]
    id_patterns, cell_patterns = classed_feature_preffix

    printlog('class 5 - value padding: larger/smaller')
    ds_t = pd.read_csv(ds, encoding='gb18030', header=0, index_col=0)

    id_classes = Preprocess.pattern_to_feature(ds_t, id_patterns, encoding='gb18030')
    cell_classes = Preprocess.pattern_to_feature(ds_t, cell_patterns, encoding='gb18030')

    for i, (id_fc, cell_fc) in enumerate(zip(id_classes, cell_classes)):
        # Features of the two classes are paired positionally.
        for id_f, cell_f in zip(id_fc, cell_fc):
            pair = [id_f, cell_f]
            larger = ds_t[pair].apply(np.max, axis=1)
            ds_t.insert(loc=ds_t.columns.get_loc(id_f),
                        column=id_f.replace('id', 'large'),
                        value=larger)
            smaller = ds_t[pair].apply(np.min, axis=1)
            ds_t.insert(loc=ds_t.columns.get_loc(id_f),
                        column=id_f.replace('id', 'small'),
                        value=smaller)
        printlog('class 5 - value padding finished {} and {}'.format(
            id_patterns[i], cell_patterns[i]))

    ds_t.to_csv(save_path, encoding='gb18030')
def outlier_data(ds, file_path=None, features=None, measure='std', threshold=3, encoding='utf-8', header=0, index_col=0):
    '''
    # Params:
        ds: str/pd.Dataframe, dataset or dataset path
        file_path(default None): str, if not None, the result is saved in path
        features(default None): str/list of str/np.array/pd.Series, if not None/empty, only corresponding features will be checked
        measure(default 'std'): str, only 'std' is implemented; any other value checks nothing
        threshold(default 3): number of standard deviations from the mean beyond which a value is an outlier
        encoding(default 'utf-8'): str, encoding of dataset and of the saved result
        header(default 0): int, forwarded to pandas.read_csv() when ds is a path
        index_col(default 0): int, forwarded to pandas.read_csv() when ds is a path

    # Instructions:
        Count, per checked feature, the values with |value - mean| > threshold * std and
        log how many features have at least one. The saved series is indexed by position
        (0..n-1) in the order the features were checked.
    '''
    ds = pd.read_csv(ds, encoding=encoding, header=header,
                     index_col=index_col) if isinstance(ds, str) else ds

    # len() instead of truthiness: np.array / pd.Series have no unambiguous truth value.
    if features is None or len(features) == 0:
        columns = ds.columns
    else:
        columns = [features] if isinstance(features, str) else features

    outlier_counts = []
    if measure == 'std':
        for feature in columns:
            column = ds[feature]
            is_outlier = np.abs(column - column.mean()) > threshold * column.std()
            # Count the outliers; summing their values could cancel out or go negative.
            outlier_counts.append(is_outlier.sum())
    series_outlier = pd.Series(outlier_counts, dtype='int64')

    printlog('OUTLIER: {}/{} features(threshold: {}, measure: {})'.format(
        (series_outlier > 0).values.sum(), series_outlier.size, threshold, measure))
    if file_path:
        series_outlier.to_csv(file_path, encoding=encoding, header=True)
def drop_sparse(ds, features, threshold, save_path=None, encoding='utf-8', header=0, index_col=0):
    '''
    # Params:
        ds: str/pd.DataFrame, dataset
        features: 'all', a single feature name, or a list-like of feature names to check
        threshold: int, threshold for least notna samples of feature
        save_path(default None): str, if given the reduced dataset is written there
        encoding(default 'utf-8'): str, encoding used for reading and saving
        header(default 0): int, forwarded to pandas.read_csv() when ds is a path
        index_col(default 0): int, forwarded to pandas.read_csv() when ds is a path

    # Returns:
        pd.DataFrame, ds without the checked features that have at most `threshold` non-NA values

    # Instructions:
        The input DataFrame itself is not modified.
    '''
    printlog('Preprocess.drop_sparse: started.')
    ds = pd.read_csv(ds, encoding=encoding, header=header,
                     index_col=index_col) if isinstance(ds, str) else ds

    # Only compare with 'all' when features is a str; `array == 'all'` is element-wise.
    if isinstance(features, str):
        features = ds.columns if features == 'all' else [features]

    notna_counts = ds[features].notna().sum(axis=0)
    columns_todrop = notna_counts.index[notna_counts <= threshold]
    ds = ds.drop(columns=columns_todrop)

    if save_path:
        ds.to_csv(save_path, encoding=encoding)
    printlog('Preprocess.drop_sparse: finished. {} features dropped.'.format(
        len(columns_todrop)))
    # Without this the result was lost whenever save_path was not given.
    return ds
def generateExperienceFeature(ds):
    '''
    # Params:
        ds: str, path of the dataset csv (gb18030); the file is overwritten with the result

    # Instructions:
        1. Replace 'cons_tot_m12_visits' by a bucket value (-99, 500, 1000, 1500 or 9000).
        2. Write 'pd_gender_age', a code derived from 'pd_id_gender' and 'pd_id_apply_age'
           (-1 when no rule matches). A new column is inserted before the last column.
    '''
    printlog(
        '-----------------------------------generate experience feature-----------------------------------'
    )
    ds_temp = pd.read_csv(ds, encoding='gb18030', header=0, index_col=0)

    # (lower, upper, bucket value); applied in order, bounds inclusive on both sides.
    visit_buckets = (
        (-99.001, -0.001, -99),
        (-0.001, 500.001, 500),
        (500.001, 1000.001, 1000),
        (1000.001, 1500.001, 1500),
        (1500.001, 900000, 9000),
    )
    series_t = pd.Series(ds_temp['cons_tot_m12_visits'], ds_temp.index)
    for lower, upper, bucket in visit_buckets:
        series_t[series_t.between(lower, upper)] = bucket
    ds_temp.loc[:, 'cons_tot_m12_visits'] = series_t

    # (gender, lower age, upper age, code); applied in order, so a later rule wins on overlap.
    gender_age_rules = (
        (0, -99.001, 30.001, 0),
        (0, 30.001, 60.001, 1),
        (0, 60.001, 999.001, 2),
        (1, -0.001, 24.001, 3),
        (1, 24.001, 35.001, 4),
        (1, 35.001, 45.001, 5),
        (1, 45.001, 999.001, 2),
    )
    series_t = pd.Series(data=-1, index=ds_temp.index)
    for gender, lower, upper, code in gender_age_rules:
        matches = (ds_temp['pd_id_gender'] == gender) & (
            ds_temp['pd_id_apply_age'].between(lower, upper))
        series_t[matches] = code

    if 'pd_gender_age' in ds_temp.columns:
        ds_temp.loc[:, 'pd_gender_age'] = series_t
    else:
        ds_temp.insert(ds_temp.columns.size - 1, 'pd_gender_age', series_t)

    ds_temp.to_csv(ds, encoding='gb18030')
def sparse_feature(ds, features=None, file_path=None, measure='std', threshold=0.01, largeset=False, encoding='utf-8', header=0, index_col=0):
    '''
    # Params:
        ds: pandas.Dataframe, numpy.ndarray or str of dataset path shaped [n of samples, n of features]
        features(default None): str/list of str, if not None, only corresponding features in ds will be checked
        file_path(default None): str, if not None, result is saved at the path
        measure(default 'std'): str, either 'mean' or 'std'; the statistic is taken over the
            non-zero indicator (ds != 0) of each feature, so 'mean' is the share of non-zero entries
        threshold(default 0.01): float, threshold for deciding whether a feature is sparse
        largeset(default False): boolean, currently unused
        encoding(default 'utf-8'): str, encoding of dataset
        header(default 0): int, works on pandas.read_csv()
            (learn more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html)
        index_col(default 0 ): int, works on pandas.read_csv()
            (learn more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html)

    # Return:
        pandas.Series, boolean values of shape [n of features]
        (True represent the feature is not sparse; False represent sparse)
    '''
    assert measure in [
        'mean', 'std'
    ], 'EDA.sparse_feature: parameter measure should be either \'mean\' or \'std\', {} is given'.format(
        measure)

    if isinstance(ds, str):
        ds = pd.read_csv(ds, encoding=encoding, header=header, index_col=index_col)
    elif not isinstance(ds, pd.DataFrame):
        # The documented ndarray input has no column-wise .mean()/.std()/.to_csv().
        ds = pd.DataFrame(ds)

    if features is not None:
        features = [features] if isinstance(features, str) else list(features)
        ds = ds[features]

    nonzero = ds != 0
    if measure == 'mean':
        insparse_feature = nonzero.mean() > threshold
    else:
        insparse_feature = nonzero.std() > threshold

    printlog('SPARSE: {}/{} features(threshold: {}, measure: {})'.format(
        insparse_feature.sum(), insparse_feature.size, threshold, measure))
    if file_path:
        insparse_feature.to_csv(file_path, encoding=encoding, header=True)
    return insparse_feature
def gridTrainValidSelection(estimator, grid, X_train, y_train, X_valid, y_valid, metric=roc_auc_score, scoreLabel='ROC AUC', to_file=None, showPlot=True, n_jobs=-1, verbose=10, predict_proba=True, greater_is_better=True, vrange=None, cmap=plt.cm.Blues):
    '''
    # Params:
        estimator: estimator handed to fitModels
        grid: parameter grid, expanded with ParameterGrid
        X_train, y_train: data passed to fitModels
        X_valid, y_valid: data passed to scoreModels
        metric, predict_proba: forwarded to scoreModels
        scoreLabel, to_file, greater_is_better, vrange, cmap: forwarded to _plotScores
        showPlot(default True): boolean, whether _plotScores is called
        n_jobs, verbose: forwarded to fitModels and scoreModels

    # Returns:
        tuple (best model, best score, all models, all scores)
    '''
    paramGrid = ParameterGrid(grid)

    printlog("-------------FITTING MODELS-------------")
    models = fitModels(estimator, paramGrid, X_train, y_train, n_jobs, verbose)

    printlog("-------------SCORING MODELS-------------")
    scores = scoreModels(models, X_valid, y_valid, metric, predict_proba, n_jobs, verbose)

    if showPlot:
        _plotScores(scores, paramGrid, to_file, scoreLabel, greater_is_better, vrange, cmap)

    best_model = getBestModel(models, scores, greater_is_better)
    best_score = getBestScore(scores, greater_is_better)
    return best_model, best_score, models, scores
def shape(ds, largeset=False, encoding='utf-8', header=0, index_col=0):
    '''
    # Params:
        ds: pd.Dataframe or str of dataset path
        largeset(default False): boolean, currently unused
        encoding(default 'utf-8'): str, encoding used when ds is a path
        header(default 0): int, forwarded to pandas.read_csv() when ds is a path
        index_col(default 0): int, forwarded to pandas.read_csv() when ds is a path

    # Instructions:
        Log the number of rows and the number of columns of the dataset.
    '''
    if isinstance(ds, str):
        # header/index_col were previously hard-coded to 0, ignoring the arguments.
        ds = pd.read_csv(ds, encoding=encoding, header=header, index_col=index_col)
    printlog('SAMPLE(row): {}'.format(ds.shape[0]))
    printlog('FEATURE/LABEL(column): {}'.format(ds.shape[1]))
def na_data(ds, file_path=None, save_graph=True, features=None, encoding='utf-8', header=0, index_col=0):
    '''
    # Params:
        ds: str/pd.Dataframe, dataset or dataset path
        file_path(default None): str, if not None, result of checking is saved at the path
        save_graph(default True): boolean, whether save result as graph or csv
        features(default None): str/list of str/np.array/pd.Series, if not None/empty, only the corresponding features will be checked
        encoding(default 'utf-8'): str, encoding of dataset
        header(default 0): int, works on pandas.read_csv()
            (learn more at:https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html)
        index_col(default 0): int, works in pandas.read_csv()
            (learn more at:https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html)

    # Instructions:
        Count NA values per checked feature, log a summary and optionally save either a
        histogram of the counts (file_path ending in .png/.jpg/.jpeg) or the counts
        themselves (file_path ending in .csv). The counts are indexed by position.
    '''
    ds = pd.read_csv(ds, encoding=encoding, header=header,
                     index_col=index_col) if isinstance(ds, str) else ds

    # len() instead of truthiness: np.array / pd.Series have no unambiguous truth value.
    if features is None or len(features) == 0:
        columns = ds.columns
    else:
        columns = [features] if isinstance(features, str) else features

    # Built in one go; Series.append no longer exists in pandas 2.
    series_na = pd.Series([ds[feature].isna().sum() for feature in columns],
                          dtype='int64')
    printlog('NA: {}/{} features(totally {} data)'.format(
        (series_na > 0).sum(), series_na.size, series_na.sum()))

    if file_path and save_graph:
        # Escaped and anchored: an unescaped '.png' also matched e.g. 'my_pngs/out.csv'.
        assert re.search(
            r'\.(png|jpg|jpeg)$', file_path
        ), 'EDA.na_data: file_path is not in image format; use .png, .jpg, .jpeg suffix'
        sns.distplot(series_na, kde=False)
        plt.title('Na data in features')
        plt.xlabel('feature count')
        plt.ylabel('Na data count')
        plt.savefig(file_path)
        plt.close()
    if file_path and not save_graph:
        assert re.search(
            r'\.csv$', file_path
        ), 'EDA.na_data: file_path does not match tabular format; use .csv suffix'
        series_na.to_csv(file_path, encoding=encoding, header=True)
def feature_padding_on_hit_rate(ds, features, preffix_patterns, encoding='utf-8', header=0, index_col=0):
    '''
    # Params:
        ds: str/pd.DataFrame, dataset or dataset path
        features: currently unused
        preffix_patterns: list of str, each a '^'-anchored literal prefix such as '^als_d7_id_'
        encoding(default 'utf-8'): str, encoding of dataset
        header(default 0): int, forwarded to pandas.read_csv() when ds is a path
        index_col(default 0): int, forwarded to pandas.read_csv() when ds is a path

    # Returns:
        list of str: for every suffix found under at least one prefix, the prefixed feature
        with the highest share of non-NA samples (first prefix wins ties; suffixes whose
        candidates are all entirely NA are skipped). Order follows first appearance of the suffix.
    '''
    ## features matched by each prefix pattern
    classed_class_features = Preprocess.pattern_to_feature(ds, preffix_patterns, encoding=encoding)
    # Read with the caller's encoding; it was hard-coded to gb18030 although
    # pattern_to_feature above already receives `encoding`.
    ds = pd.read_csv(ds, encoding=encoding, header=header,
                     index_col=index_col) if isinstance(ds, str) else ds

    ## suffixes per class; the pattern carries a leading '^', so the literal
    ## prefix is len(pattern) - 1 characters long
    suffixes_per_class = [
        [name[len(pattern) - 1:] for name in feature_class]
        for pattern, feature_class in zip(preffix_patterns, classed_class_features)
    ]
    suffix_sets = [set(suffixes) for suffixes in suffixes_per_class]
    ## unique suffixes in first-seen order (a plain set would make the result order
    ## depend on string hashing)
    class_suffix = list(dict.fromkeys(
        suffix for suffixes in suffixes_per_class for suffix in suffixes))

    n_samples = ds.shape[0]
    mut_exc_feature = []
    for suffix in class_suffix:
        best_rate = 0
        best_feature = ''
        for pattern, members in zip(preffix_patterns, suffix_sets):
            if suffix not in members:
                continue
            candidate = pattern[1:] + suffix
            hit_rate = ds[candidate].notna().sum() / n_samples
            if hit_rate > best_rate:
                best_rate = hit_rate
                best_feature = candidate
        if best_feature != '':
            mut_exc_feature.append(best_feature)

    printlog('feature_padding_on_hit_rate: mut_exc_feature: {}'.format(
        mut_exc_feature), printable=False)
    return mut_exc_feature
def dull_feature(ds, threshold, label_column, encoding='utf-8', header=0, index_col=0):
    '''
    # Params:
        ds: str/pd.Dataframe, dataset or dataset path
        threshold: number compared with the share (0..1) of the most frequent
            (value, label) pair of each feature
        label_column: int position or name of the label column
        encoding(default 'utf-8'): str, encoding of dataset
        header(default 0): int, works on pandas.read_csv()
            (learn more at:https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html)
        index_col(default 0): int, works in pandas.read_csv()
            (learn more at:https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html)

    # Instructions:
        Log how many features are dominated by a single (value, label) pair, i.e. whose
        most frequent pair makes up more than `threshold` of the samples.
    '''
    if isinstance(ds, str):
        ds = pd.read_csv(ds, encoding=encoding, header=header, index_col=index_col)
    if isinstance(label_column, int):
        label_column = ds.columns[label_column]

    def _pair_with_label(column):
        # '<value>_<label>' string for every sample
        return column.astype(str) + '_' + ds.loc[:, label_column].astype(str)

    def _top_pair_share(column):
        counts = column.value_counts()
        return counts.max() / counts.sum()

    feature_frame = ds.iloc[:, ds.columns != label_column]
    top_share = feature_frame.apply(_pair_with_label).apply(_top_pair_share)
    dull_count = (top_share > threshold).sum()

    printlog(
        'dull samples: {}/{} features contain more than {} dull samples'.
        format(dull_count, ds.columns.size - 1, threshold))
def gridCVSelection(estimator, estimator_name, save_folder, train_features, train_label, valid_features, valid_label, grid_params, grid_scorers, refit_scorer, n_jobs=-1):
    """
    # Params:
        estimator: estimator handed to GridSearchCV
        estimator_name, save_folder: when both are truthy, _plotGridCVResult is called with them
        train_features, train_label: data of the first grid search
        valid_features, valid_label: data of the second grid search
        grid_params: parameter grid for GridSearchCV
        grid_scorers: scoring argument for GridSearchCV
        refit_scorer: str, scorer name used for refit and for reading 'mean_test_<name>' / 'std_test_<name>'
        n_jobs(default -1): int, forwarded to GridSearchCV

    # Returns:
        dict, the parameter combination maximising
        train mean score + (train mean score - valid mean score) + valid score std ** 2

    # Example:
    ```
    xgb = XGBClassifier()
    train_dataset = pd.read_csv(ds_train, header=0, index_col=0)
    valid_dataset = pd.read_csv(ds_valid, header=0, index_col=0)
    xgb_params = {'max_depth': [3, 4, 5], 'n_estimators': range(10, 301, 10)}
    xgb_scorer = ['neg_mean_squared_error', 'roc_auc']
    Assess.gridCVSelection(xgb, 'xgb', 'misc',
        train_dataset.loc[:, selected_features], train_dataset.iloc[:,-1],
        valid_dataset.loc[:, selected_features], valid_dataset.iloc[:,-1],
        xgb_params, xgb_scorer, refit_scorer='roc_auc')
    ```
    """
    printlog('Assess.gridCVSelection: {} started.'.format(estimator_name))
    # `scoring` must be passed by keyword: everything after param_grid is keyword-only
    # in current scikit-learn releases.
    grid = GridSearchCV(estimator, grid_params, scoring=grid_scorers,
                        refit=refit_scorer, n_jobs=n_jobs)

    grid.fit(X=train_features, y=train_label)
    train_CV_result = grid.cv_results_
    grid.fit(X=valid_features, y=valid_label)
    valid_CV_result = grid.cv_results_

    if estimator_name and save_folder:
        _plotGridCVResult(estimator_name, save_folder, grid_params,
                          grid_scorers, train_CV_result, valid_CV_result)

    mean_key = 'mean_test_{}'.format(refit_scorer)
    std_key = 'std_test_{}'.format(refit_scorer)
    bias = train_CV_result[mean_key] - valid_CV_result[mean_key]
    variance = np.power(valid_CV_result[std_key], 2)
    error = bias + variance
    expected_CV_result = train_CV_result[mean_key] + error

    optimal_params = train_CV_result['params'][np.argmax(expected_CV_result)]
    printlog('Assess.gridCVSelection: optimal params: {}'.format(optimal_params))
    printlog('Assess.gridCVSelection: {} finished.'.format(estimator_name))
    return optimal_params
def feature_type(ds, file_path=None, save_graph=True, encoding='utf-8', header=0, index_col=0):
    '''
    # Params:
        ds: str/pd.DataFrame, dataset
        file_path(default None): str, if not None, the result is saved at the path
        save_graph(default True): boolean, save a bar chart (.png/.jpg/.jpeg) or the counts (.csv)
        encoding(default 'utf-8'): str, encoding of dataset and of the saved csv
        header(default 0): int, forwarded to pandas.read_csv() when ds is a path
        index_col(default 0): int, forwarded to pandas.read_csv() when ds is a path

    # Instructions:
        Check dataset feature types: count the columns per dtype, log the counts and
        optionally save them.
    '''
    ds = pd.read_csv(ds, encoding=encoding, header=header,
                     index_col=index_col) if isinstance(ds, str) else ds
    type_count = ds.dtypes.value_counts()
    printlog('FEATURE TYPECOUNT: \n{}'.format(type_count))

    if file_path and save_graph:
        # Escaped and anchored so only a real image suffix is accepted.
        assert re.search(
            r'\.(png|jpg|jpeg)$', file_path
        ), 'EDA.feature_type: file_path is not in image format; use .png, .jpg, .jpeg suffix'
        plt.bar([str(value) for value in type_count.index.values],
                type_count.values)
        plt.title('Feature type')
        plt.xlabel('dtype')
        plt.ylabel('Feature count')
        plt.savefig(file_path)
        plt.close()
    if file_path and not save_graph:
        assert re.search(
            r'\.csv$', file_path
        ), 'EDA.feature_type: file_path does not match tabular format; use .csv suffix'
        # `header` is the read_csv header row index, not a to_csv flag; passing it
        # through (0 by default) silently dropped the header line.
        type_count.to_csv(file_path, encoding=encoding, header=True)
def poor_sample(ds, threshold, encoding='utf-8', header=0, index_col=0):
    '''
    # Params:
        ds: str/pd.Dataframe, dataset or dataset path
        threshold: int, compared with the number of notNa features of every sample
        encoding(default 'utf-8'): str, encoding of dataset
        header(default 0): int, works on pandas.read_csv()
            (learn more at:https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html)
        index_col(default 0): int, works in pandas.read_csv()
            (learn more at:https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html)

    # Instructions:
        Log how many samples have more than `threshold` notNa features.
    '''
    if isinstance(ds, str):
        ds = pd.read_csv(ds, encoding=encoding, header=header, index_col=index_col)

    notna_per_sample = ds.notna().sum(axis=1)
    above_threshold = (notna_per_sample > threshold).sum()
    message = 'poor samples: {}/{} samples contain more than {} notNa features'
    printlog(message.format(above_threshold, ds.index.size, threshold))
def feature_EDA(ds, features, label_column=None, encoding='utf-8', printable=True, header=0, index_col=0):
    '''
    # Params:
        ds: str/pd.DataFrame, dataset
        features: (list of )feature str; np.ndarray and pd.Series are accepted as well
        label_column(default None): int position or name of the label column; None to skip the label distribution
        encoding(default 'utf-8'): str, encoding of dataset
        printable(default True): forwarded to printlog
        header(default 0): int, forwarded to pandas.read_csv() when ds is a path
        index_col(default 0): int, forwarded to pandas.read_csv() when ds is a path

    # Instructions:
        For every feature log its distinct non-NA values, its dtype, the counts of those
        values and, if label_column is given, the label counts among the non-NA samples.
    '''
    # np.ndarray, not np.array: the latter is a function and made isinstance() raise
    # TypeError for every input that was not a str or list.
    assert isinstance(features, (str, list, np.ndarray, pd.Series)), 'EDA.feature_EDA: features should be str, list, np.array or pd.Series; input in {}'.format(type(features))
    ds = pd.read_csv(ds, encoding=encoding, header=header,
                     index_col=index_col) if isinstance(ds, str) else ds
    features = [features] if isinstance(features, str) else features

    # `is not None` so that position 0 is a valid label column.
    has_label = label_column is not None
    if has_label and isinstance(label_column, int):
        label_column = ds.columns[label_column]

    for feature in features:
        present = ds[ds[feature].notna()]
        if has_label:
            label_distribution = list(present[label_column].value_counts().values)
        else:
            label_distribution = '(label_column not given)'
        printlog('feature {} has values {} of dtypes {}, distribution {}, label distribution {}'.format(
            feature,
            list(set(np.ravel(present[feature].values))),
            list(set(np.ravel(ds[feature].values.dtype))),
            list(present[feature].value_counts().values),
            label_distribution
        ), printable=printable)
def feature_na(ds, features, encoding='utf-8', header=0, index_col=0):
    '''
    # Params:
        ds: str/pd.DataFrame, dataset
        features: (list of )feature str; np.ndarray and pd.Series are accepted as well
        encoding(default 'utf-8'): str, encoding of dataset
        header(default 0): int, forwarded to pandas.read_csv() when ds is a path
        index_col(default 0): int, forwarded to pandas.read_csv() when ds is a path

    # Instructions:
        Show na data numbers in features.
    '''
    # np.ndarray, not np.array: the latter is a function and made isinstance() raise
    # TypeError for every input that was not a str or list.
    assert isinstance(features, (str, list, np.ndarray, pd.Series)), 'EDA.feature_na: unexpected features: {}'.format(type(features))
    ds = pd.read_csv(ds, encoding=encoding, header=header,
                     index_col=index_col) if isinstance(ds, str) else ds
    features = [features] if isinstance(features, str) else features

    n_samples = ds.index.size
    for feature in features:
        printlog('feature {} has {}/{} na sample(s)'.format(
            feature, ds[feature].isnull().sum(), n_samples
        ))
def optimalCutoff(estimator, features, labels):
    '''
    # Params:
        estimator: fitted model exposing predict_proba(features)
        features: samples handed to estimator.predict_proba
        labels: array-like of 0/1 labels aligned with features

    # Returns:
        float, the cutoff in {0.001, 0.002, ..., 1.000} maximising
        0.1 * #(selected & label == 0) - 0.4 * #(selected & label == 1),
        where a sample is selected when column 0 of predict_proba exceeds the cutoff
        (first maximum wins on ties).
    '''
    # Column 0 of predict_proba; presumably the probability of label 0 - confirm
    # against the estimator's class order.
    proba = estimator.predict_proba(features)[:, 0]
    printlog('dataset size: {}'.format(labels.shape))
    printlog('dataset label 0: {}'.format((labels == 0).sum()))

    is_label_0 = labels == 0
    is_label_1 = labels == 1

    assessment = []
    for step in range(1, 1001):
        cutoff = step / 1000
        # Threshold a fresh mask every time. The scores used to be overwritten with
        # 0/1 in place, so every cutoff after the first one saw already-binarised data.
        selected = proba > cutoff
        true_neg = (selected & is_label_0).sum()
        false_neg = (selected & is_label_1).sum()
        assessment.append(true_neg * 0.1 - false_neg * 0.4)

    best = int(np.argmax(assessment))
    best_cutoff = (best + 1) / 1000
    printlog('optimalCutoff: {}'.format(best_cutoff))
    printlog('optimalCutoff target function: {}'.format(assessment[best]))
    return best_cutoff
def select_feature_iv(ds, features, label_column, strict_upper_bound, strict_lower_bound, to_file=None, encoding='utf-8', header=0, index_col=0, informative=True):
    '''
    # Params:
        ds: str/pd.DataFrame, dataset or dataset path
        features: feature name/position or list of them
        label_column: forwarded to feature_iv
        strict_upper_bound, strict_lower_bound: exclusive bounds on the iv of the kept features
        to_file(default None): str, if given the iv of every feature is saved there as csv
        encoding, header, index_col: used for reading ds and forwarded to feature_iv
        informative(default True): forwarded to printlog as printable for the start/finish lines

    # Returns:
        list of the features whose iv lies strictly between the two bounds
    '''
    printlog('Temp_support.select_feature_iv: started.', printable=informative)
    assert strict_upper_bound > strict_lower_bound, 'Temp_support.select_feature_iv: strict_upper_bound should be larger than strict_lowr_bound'

    if isinstance(ds, str):
        ds = pd.read_csv(ds, encoding=encoding, header=header, index_col=index_col)
    if isinstance(features, (str, int)):
        features = [features]
    # positions -> column names
    features = [ds.columns[f] if isinstance(f, int) else f for f in features]

    printlog('Temp_support.select_feature_iv: calculating feature iv...')
    features_iv = feature_iv(ds, features, label_column, encoding=encoding,
                             header=header, index_col=index_col)

    if to_file:
        printlog('Temp_support.select_feature_iv: saving to path {}...'.format(
            to_file))
        iv_table = pd.DataFrame(features_iv, index=features, columns=['iv'])
        iv_table.to_csv(to_file, encoding=encoding)

    printlog('Temp_support.select_feature_iv: temporary feature iv: {}'.format(
        list(zip(features, features_iv))), printable=False)
    printlog('Temp_support.select_feature_iv: finished.', printable=informative)

    selected = []
    for feature, iv in zip(features, features_iv):
        if iv > strict_lower_bound and iv < strict_upper_bound:
            selected.append(feature)
    return selected
def fill_na(ds, features, replacement=-99, flag_feature=None, flag_replacement=None, save_path=None, encoding='utf-8', header=0, index_col=0, informative=True):
    '''
    # Params:
        ds: str/pd.DataFrame, dataset
        features: (list of )str, checked features; 'all' for every column
        replacement(default -99): int, replacement for na data
        flag_feature(default None): str/int, possible flag feature (name or position) for alternative replacement
        flag_replacement(default None): int, alternative replacement for possible flag feature
        save_path(default None): str, if given the filled dataset is written there
        encoding, header, index_col: used for reading ds (and encoding for saving)
        informative(default True): forwarded to printlog as printable for the start/finish lines

    # Returns:
        pd.DataFrame of ds (a DataFrame passed in is modified in place)

    # Raises:
        Exception('still na') if a feature still holds na afterwards, e.g. when the flag
        feature is neither 0 nor 1 on a row with na

    # Instructions:
        Replace na data of given features, replace with flag_replacement if flag_feature is 1.
        The flag logic is used only when both flag_feature and flag_replacement are given.
    '''
    printlog('Preprocess.fill_na: started.', printable=informative)
    ds = pd.read_csv(ds, encoding=encoding, header=header,
                     index_col=index_col) if isinstance(ds, str) else ds

    # Only compare with 'all' when features is a str; `array == 'all'` is element-wise.
    if isinstance(features, str):
        features = ds.columns if features == 'all' else [features]

    # `is not None` rather than truthiness: column position 0 and replacement value 0
    # are legitimate arguments.
    use_flag = flag_feature is not None and flag_replacement is not None
    if use_flag and isinstance(flag_feature, int):
        flag_feature = ds.columns[flag_feature]

    for feature in features:
        missing = ds[feature].isna()
        if use_flag:
            printlog('fill na: feature: {}; \tflag_feature: {}'.format(
                feature, flag_feature), printable=False)
            ds.loc[missing & (ds[flag_feature] == 1), feature] = flag_replacement
            ds.loc[missing & (ds[flag_feature] == 0), feature] = replacement
        else:
            ds.loc[missing, feature] = replacement
        if ds[feature].isna().any():
            raise Exception('still na')

    if save_path:
        ds.to_csv(save_path, encoding=encoding)
    printlog('Preprocess.fill_na: finished.', printable=informative)
    return ds
def split_measure(label_train, label_test, labels, encoding='utf-8', header=0, index_col=0):
    '''
    # Params:
        label_train: str/pd.DataFrame/pd.Series, trainset label (or path of its csv)
        label_test: str/pd.DataFrame/pd.Series, testset label (or path of its csv)
        labels: iterable of the label values to report
        encoding, header, index_col: forwarded to pandas.read_csv() for path inputs

    # Instructions:
        Check the distribution of labels between trainset and testset: for each set,
        log the share (rounded to 3 decimals) of entries equal to every value in `labels`.
    '''
    # isinstance also admits pd.Series and DataFrame subclasses, which the exact
    # type() comparison rejected.
    accepted = (str, pd.DataFrame, pd.Series)
    assert isinstance(
        label_train, accepted
    ), 'Preprocess.split_measure: input should be str, pd.DataFrame or pd.Series'
    assert isinstance(
        label_test, accepted
    ), 'Preprocess.split_measure: input should be str, pd.DataFrame or pd.Series'

    if isinstance(label_train, str):
        label_train = pd.read_csv(label_train, encoding=encoding,
                                  header=header, index_col=index_col)
    if isinstance(label_test, str):
        label_test = pd.read_csv(label_test, encoding=encoding,
                                 header=header, index_col=index_col)

    for title, label_set in (('Trainset: ', label_train), ('Testset: ', label_test)):
        set_size = label_set.shape[0]
        printlog(title)
        for label in labels:
            share = (label_set == label).values.sum() / set_size
            printlog('\tlabel {}: {}'.format(label, round(share, 3)))
def cut(ds, features, threshold=10, bin=None, method='equal-distance', label_column=None, save_path=None, encoding='utf-8', header=0, index_col=0, informative=True):
    '''
    # Params:
        ds: str/pd.DataFrame, dataset or dataset path; the checked features must not contain na
        features: feature name/position or list of them
        threshold(default 10): int, only features with more than `threshold` distinct values are binned
        bin(default None): bin argument for pd.cut / pd.qcut (required for 'equal-distance' and
            'equal-frequency'); for 'optimal' it sets the tree depth to int(log2(bin)) + 1 (4 if not given)
        method(default 'equal-distance'): 'equal-distance' (pd.cut), 'equal-frequency' (pd.qcut with
            duplicates='drop') or 'optimal' (optimal_cut); any other value leaves the features unchanged
        label_column(default None): int position or name of the label column, required for 'optimal'
        save_path(default None): str, if given the binned dataset is written there
        encoding, header, index_col: used for reading ds (and encoding for saving)
        informative(default True): forwarded to printlog as printable for the start/finish lines

    # Returns:
        pd.DataFrame of ds with the selected features replaced by their bins
        (a DataFrame passed in is modified in place)
    '''
    printlog('Temp_support.cut: started.', printable=informative)
    ds = pd.read_csv(ds, encoding=encoding, header=header,
                     index_col=index_col) if isinstance(ds, str) else ds
    features = [features] if isinstance(features, (str, int)) else features
    features = [ds.columns[f] if isinstance(f, int) else f for f in features]
    assert not ds.loc[:, features].isna().values.any(
    ), 'Temp_support.cut: ds should not contain na data'

    features = [
        feature for feature in features
        if ds[feature].unique().size > threshold
    ]
    for feature in features:
        printlog('Temp_support.cut: cutting {}'.format(feature),
                 printable=False)
        # Columns are replaced with ds[feature] = ...: the bins have a different dtype
        # than the numeric column, which .loc[:, feature] = ... tries to keep in
        # recent pandas versions.
        if method == 'equal-distance':
            assert bin, 'Temp_support.cut: bin should be input'
            ds[feature] = pd.cut(ds[feature], bin)
        elif method == 'equal-frequency':
            assert bin, 'Temp_support.cut: bin should be input'
            ds[feature] = pd.qcut(ds[feature], bin, duplicates='drop')
        elif method == 'optimal':
            # `is not None` so that position 0 is a valid label column.
            assert label_column is not None, 'Temp_support: optimal cut should give label column'
            label_column = ds.columns[label_column] if isinstance(
                label_column, int) else label_column
            max_depth = int(np.log2(bin)) + 1 if bin else 4
            min_leaf = int(ds[feature].unique().size / (2 ** max_depth)) + 1
            ds[feature] = optimal_cut(ds[feature], ds[label_column],
                                      max_depth, min_leaf)

    if save_path:
        ds.to_csv(save_path, encoding=encoding)
    printlog('Temp_support.cut: finished.', printable=informative)
    # Without this the result was lost when ds was a path and save_path was not given.
    return ds
def tree_classifier(ds, features, label_column, max_depth=None, export_path=None, fill_na=None, fill_cat=None, encoding='utf-8', header=0, index_col=0, informative=True):
    '''
    # Params:
        ds: str/pd.DataFrame, dataset or dataset path
        features: (list of )feature str used to fit the tree
        label_column: int position or name of the label column
        max_depth(default None): forwarded to tree.DecisionTreeClassifier
        export_path(default None): str ending in '.dot'; if given the fitted tree is exported there,
            otherwise its graphviz source is logged
        fill_na(default None): callable (ds, features) -> ds, required if the features contain na
        fill_cat(default None): callable (ds, features) -> (ds, encoder, features), required if a
            feature has object dtype
        encoding, header, index_col: used for reading ds
        informative(default True): forwarded to printlog as printable for the start/finish lines

    # Returns:
        the fitted classifier, or (classifier, encoder, features) when fill_cat is given
    '''
    printlog('Model.tree_classifier: started.', printable=informative)
    ds = pd.read_csv(ds, encoding=encoding, header=header,
                     index_col=index_col) if isinstance(ds, str) else ds
    features = [features] if isinstance(features, str) else features
    label_column = ds.columns[label_column] if isinstance(
        label_column, int) else label_column

    assert fill_na or ds.loc[:, features].isna().sum().sum(
    ) == 0, 'Model.tree_classifier: features contains na data; fill_na must be given'
    assert fill_cat or np.dtype('O') not in [
        ds[column].dtype for column in features
    ], 'Model.tree_classifier: features contains categorical data; fill_cat must be given'

    if fill_na:
        ds = fill_na(ds, features)
    if fill_cat:
        ds, encoder, features = fill_cat(ds, features)

    clt = tree.DecisionTreeClassifier(max_depth=max_depth)
    clt = clt.fit(ds.loc[:, features], ds.loc[:, label_column])

    if export_path:
        # Escaped and anchored: '.dot' matched any character followed by 'dot'
        # anywhere in the path.
        assert re.search(
            r'\.dot$', export_path
        ), 'Model.tree_classifier: export_path should be in dot format'
        tree.export_graphviz(clt, export_path, feature_names=features)
    else:
        printlog(tree.export_graphviz(clt))
    printlog('Model.tree_classifier: finished.', printable=informative)

    if fill_cat:
        return clt, encoder, features
    return clt
def fill_cat(ds, features, method='label_encoder', save_path=None, encoding='utf-8', header=0, index_col=0, informative=True):
    '''
    # Introductions:
        Automatically check given features and encode categorical (object dtype) columns in
        pd.DataFrame into numerical-encoded or one-hot-encoded columns.
        Learn more at: label_encoder https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
        and label_binarizer https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelBinarizer.html
        For label binarizer, columns with categorical data are removed and new columns are
        generated by '[old feature name]_[categorical value]'

    # Params:
        ds: str/pd.DataFrame, (path of )dataset
        features: (list of )features; 'all' for every column
        method(default 'label_encoder'): 'label_encoder' or 'label_binarizer'
        save_path(default None): str, path of encoded dataset, optional for label encoder
            while required for label binarizer
        encoding, header, index_col: used for reading ds (and encoding for saving)
        informative(default True): forwarded to printlog as printable for the start/finish lines

    # Returns:
        Encoded dataset in pd.DataFrame (the newly-generated columns are placed in front of the
            remaining columns for label binarizer; for label encoder a DataFrame passed in is
            modified in place)
        sklearn.preprocessing.LabelEncoder()/sklearn.preprocessing.LabelBinarizer(); the same
            encoder object is refitted for every categorical feature, so encoder.classes_
            describes the last encoded feature only
        List of new features (for label binarizer the non-categorical features first, then the
            newly-generated ones; the caller's list is not modified)

    # Raises:
        ValueError if method is neither 'label_encoder' nor 'label_binarizer'
    '''
    printlog('Preprocess.fill_cat: started.', printable=informative)
    ds = pd.read_csv(ds, encoding=encoding, header=header,
                     index_col=index_col) if isinstance(ds, str) else ds

    # Only compare with 'all' when features is a str; `array == 'all'` is element-wise.
    if isinstance(features, str):
        features = ds.columns if features == 'all' else [features]

    object_dtype = np.dtype('O')
    if method == 'label_encoder':
        encoder = sklearn.preprocessing.LabelEncoder()
        # Checked feature by feature: a boolean mask built from ds.dtypes[features] does
        # not line up with all columns of ds when features is a subset.
        categorical_features = [
            feature for feature in features
            if ds[feature].dtype == object_dtype
        ]
        printlog('Preprocess.fill_cat: categorical features: {}'.format(
            categorical_features), printable=False)
        for feature in categorical_features:
            # Plain column assignment so the column takes the integer dtype of the codes.
            ds[feature] = encoder.fit_transform(ds[feature].astype(str))
    elif method == 'label_binarizer':
        assert save_path, 'Preprocess.fill_cat: method \'label_binarizer\' split categorical feature into one-hot features, therefore new ds must be saved'
        encoder = sklearn.preprocessing.LabelBinarizer()
        kept_features = []
        generated_features = []
        # Iterate over a copy and collect into new lists; the list used to be extended
        # and shrunk while it was being iterated.
        for feature in list(features):
            if ds[feature].dtype != object_dtype:
                kept_features.append(feature)
                continue
            values = ds[feature].astype(str)
            encoder.fit(values)
            onehot = encoder.transform(values)
            if onehot.shape[1] != len(encoder.classes_):
                # LabelBinarizer returns a single column for one or two classes;
                # rebuild one indicator column per class.
                onehot = (values.to_numpy()[:, None]
                          == encoder.classes_[None, :]).astype(int)
            # Name the columns after encoder.classes_: that is the order of the
            # transformed columns, unlike the order of a set of the values.
            tmp_new_feature = [
                feature + '_' + str(suffix) for suffix in encoder.classes_
            ]
            tmp_new_ds = pd.DataFrame(onehot, columns=tmp_new_feature,
                                      index=ds.index)
            ds = pd.concat([tmp_new_ds, ds.drop(columns=feature)], axis=1)
            if feature in ds.columns:
                raise Exception('still categorical')
            generated_features.extend(tmp_new_feature)
        features = kept_features + generated_features
    else:
        raise ValueError(
            'Preprocess.fill_cat: unknown method {}'.format(method))

    if save_path:
        ds.to_csv(save_path, encoding=encoding)
    printlog('Preprocess.fill_cat: finished.', printable=informative)
    return ds, encoder, features
def two_layer_tree(ds, feature_1, feature_2, label_column, to_file=None,
                   printable=False, encoding='utf-8', header=0, index_col=0):
    '''
    # Introductions:
    Count the label distribution for every value of feature_1 and for every
    (feature_1, feature_2) value pair, and report it as a two-layer tree:
    either logged, or written as a Graphviz DOT file, or both.

    # Params:
    ds: str/pd.DataFrame, (path of )dataset
    feature_1: str/int, first-layer feature (column name or position)
    feature_2: str/int, second-layer feature (column name or position)
    label_column: str/int, label column (column name or position)
    to_file(default None): str, if not None, path of the DOT file to write
    printable(default False): boolean, whether to log the distribution of
        every value pair through printlog
    encoding, header, index_col: forwarded to pandas.read_csv() when ds is a
        path

    # Notes:
    Missing values are ignored. Values and labels are listed in order of
    first appearance in the dataset, so the output is reproducible. Any
    number of distinct labels is supported.
    '''
    ds = pd.read_csv(ds, encoding=encoding, header=header,
                     index_col=index_col) if isinstance(ds, str) else ds
    feature_1 = ds.columns[feature_1] if isinstance(feature_1, int) else feature_1
    feature_2 = ds.columns[feature_2] if isinstance(feature_2, int) else feature_2
    label_column = ds.columns[label_column] if isinstance(
        label_column, int) else label_column

    # unique() keeps first-appearance order, unlike set() whose order may
    # change from run to run.
    value_1 = list(ds[feature_1].dropna().unique())
    value_2 = list(ds[feature_2].dropna().unique())
    label = list(ds[label_column].dropna().unique())

    def distribution(mask, separator):
        # '<label>: <row count>' for every label, restricted to `mask`.
        return separator.join(
            '{}: {}'.format(lb, int((mask & (ds[label_column] == lb)).sum()))
            for lb in label)

    if printable:
        for v1 in value_1:
            for v2 in value_2:
                pair = (ds[feature_1] == v1) & (ds[feature_2] == v2)
                printlog('value - {}: {}, {}: {}; dist - {}'.format(
                    feature_1, v1, feature_2, v2, distribution(pair, ', ')))

    if to_file:
        n_first, n_second = len(value_1), len(value_2)
        # Node ids: 0 is the root, 1..n_first the first layer, the second
        # layer follows, and the last id is a legend node.
        lines = ['digraph Tree {} node [shape=box] ; 0 [label=\"{}\"] ; '.format(
            '{', feature_1)]
        for i, v1 in enumerate(value_1):
            first = ds[feature_1] == v1
            lines.append('0 -> {} ;\n'.format(i + 1))
            lines.append('{} [label=\"{}\\n{}\"] ;\n'.format(
                i + 1, v1, distribution(first, '\\n')))
            for j, v2 in enumerate(value_2):
                child = n_first + n_second * i + j + 1
                lines.append('{} [label=\"{}\\n{}\"] ;\n'.format(
                    child, v2,
                    distribution(first & (ds[feature_2] == v2), '\\n')))
                lines.append('{} -> {} ;\n'.format(i + 1, child))
        lines.append('{} [label=\"layer 1: {}\\nlayer 2: {}\"] ;\n'.format(
            n_first * n_second + n_first + 1, feature_1, feature_2))
        lines.append('}')
        # Written as UTF-8 rather than the platform default so that
        # non-ASCII feature names do not depend on the machine's locale.
        with open(to_file, 'w', encoding='utf-8') as file:
            file.writelines(lines)
def run():
    '''
    Entry point of the experiment pipeline.

    Only one stage is currently live: lasso/xgb feature selection on the
    training split, driven through utils.Simplify.method_iteration and
    results_archive; the best features of both selectors are printed.
    All other stages are disabled and summarised in the notes below.
    '''
    printlog(
        '-----------------------------------start presetting-----------------------------------'
    )

    ## hyperparams (most are referenced only by the disabled stages)
    ## feature selection
    drop_sparse_threshold = 10
    hit_pos_rate_upper = 0.5
    hit_pos_rate_lower = 0.2
    tree_max_depth = None
    iv_upper_thresh = 999
    iv_lower_thresh = 0.2
    lasso_alpha = 1.0
    lasso_coef = 1e-05
    ## model
    xgb_FP_grad_mul = 0.3
    xgb_FN_grad_mul = 1.2
    xgb_zero_proba_cutoff = 0.5

    ## settings
    matplotlib.use('Agg')
    plt.rcParams['axes.unicode_minus'] = False
    plt.rcParams['font.family'] = 'SimHei'
    Log.clear_log(creative=True)

    ## datasets
    ds_path = 'data/data.csv'  # raw dataset
    ds_merged = 'data/ds_merged.csv'  # raw dataset merged with population dataset
    ds_ns = 'tmp/ds_ns.csv'  # merged dataset clear of sparse columns
    ds_na = 'tmp/ds_na.csv'  # merged dataset clear of na data
    ds_cat = 'tmp/ds_cat.csv'  # merged dataset clear of categorical feature
    ds_cut = 'tmp/ds_cut.csv'  # merged dataset cut for IV feature selection
    ds_varied = 'tmp/ds_varied.csv'  # merged dataset varied
    ds_train = 'tmp/ds_train.csv'  # split train dataset
    ds_valid = 'tmp/ds_valid.csv'  # split validation dataset
    ds_test = 'tmp/ds_test.csv'  # split test dataset
    ## feature-selection details and selected features
    iv_detail = 'iv/iv_detail.csv'  # dataset with feature IVs
    lasso_detail = 'lasso/lasso_detail.csv'  # dataset with feature lasso coefficients
    xgb_detail = 'xgb/xgb_detail.csv'  # dataset with feature xgb importances
    fe_iv = 'features/fe_iv.csv'  # selected feature by IV
    fe_lasso = 'features/fe_lasso.csv'  # selected feature by lasso coefficients
    fe_xgb = 'features/fe_xgb.csv'  # selected feature by xgb importances
    ## models
    tree_gate = 'tmp/tree_gate.joblib'  # trained tree model
    model_xgb = 'xgb/model_xgb.joblib'  # trained xgb model
    model_xgb_optim = 'xgb/model_xgb_optim.joblib'  # trained xgb model optimized
    model_stacking = 'tmp/model_stacking.joblib'  # trained stacking model
    plot_gate_tree = 'tmp/gate_tree.dot'  # plot of tree model
    fe_gate_hit = 'features/fe_gate_hit.csv'  # selected gate feature
    fe_gate_tree = 'features/fe_gate_tree.csv'  # selected tree feature
    cutoff_xgb = 'tmp/cutoff.txt'
    cutoff_xgb_optim = 'tmp/cutoff_optim.txt'

    ## class 1, 2, 4 variables
    fe_gate_pattern = ['^sl_', '^fr_', '^alu_']
    ## class 3, 5, 6, 7, 8 variables
    fe_model_pattern = ['^ir_', '^als_', '^cf_', '^cons_', '^pd_']

    # NOTE: disabled stages. They used to sit here as commented-out code; the
    # code was condensed into this summary for readability. Restore a stage
    # from the prior revision of this file if it is needed again.
    #   * prepare dataset: Preprocess.drop_sparse -> ds_ns, Preprocess.fill_na
    #     (replacement=-1) -> ds_na, Preprocess.fill_cat -> ds_cat,
    #     varyDataset -> ds_varied, generateExperienceFeature, then
    #     Preprocess.train_validation_test_split(ds_varied, -1, 0.8, 0.05,
    #     0.15) written to ds_train / ds_valid / ds_test.
    #   * gate features and tree classifier: refreshModelFeature with
    #     fe_gate_pattern, Feature_selection.hit_positive_rate at the upper
    #     and lower thresholds -> fe_gate_hit / fe_gate_tree,
    #     Model.tree_classifier -> tree_gate (plot in plot_gate_tree).
    #   * IV selection: Temp_support.cut -> ds_cut,
    #     Temp_support.select_feature_iv -> iv_detail, top 5 by 'iv' -> fe_iv.
    #   * an explicit-loop version of the lasso/xgb selection below that also
    #     wrote lasso_detail, fe_lasso, xgb_detail and fe_xgb.
    #   * training: union of fe_iv / fe_xgb / fe_lasso as selected features,
    #     XGBClassifier grid search with a custom objective weighted by
    #     xgb_FN_grad_mul / xgb_FP_grad_mul -> model_xgb / model_xgb_optim,
    #     optimalCutoff -> cutoff_xgb / cutoff_xgb_optim, and grid-searched
    #     RF / ET / AB / (GBDT) / XGB estimators combined in a
    #     StackingClassifier -> model_stacking.
    #   * testing: gate prediction (hit rate and tree), then xgb and stacking
    #     predictions assessed through Assess.modelAssess and
    #     Assess.confusionMatrixFromPrediction, with applyGate / applyCutoff.

    from utils.Simplify import method_iteration, results_archive

    # -------------------- feature selection on lasso/xgb --------------------
    feature_groups = Preprocess.pattern_to_feature(
        ds_train, fe_model_pattern, encoding='gb18030')
    train_frame = pd.read_csv(ds_train, encoding='gb18030', header=0, index_col=0)

    def _selection_params(params_key):
        # Every call slices the frame again, so the two selectors never share
        # the same DataFrame/Series objects.
        return {
            'X': [train_frame.loc[:, group] for group in feature_groups],
            'y': [train_frame.iloc[:, -1]],
            params_key: [{
                'alpha': lasso_alpha
            }],
            'sort_index': [2],
            'sorted': [True],
            'encoding': ['gb18030']
        }

    lasso_select_params = _selection_params('lasso_params')
    xgb_select_params = _selection_params('xgb_params')
    keys = [['best_lasso_features', 'all_lasso_features'],
            ['best_xgb_features', 'all_xgb_features']]
    selection_runs = method_iteration(
        methods=[
            Feature_selection.select_on_lasso,
            Feature_selection.select_on_xgb
        ],
        params=[lasso_select_params, xgb_select_params])
    lasso_res, xgb_res = results_archive(
        results=selection_runs, keys=keys, listed=False)
    print('lasso best features: {}'.format(lasso_res['best_lasso_features']))
    print('xgb best features: {}'.format(xgb_res['best_xgb_features']))

    printlog(
        '-----------------------------------finished-----------------------------------'
    )
def EDA(ds, data_type, folder=None, save_graph=True, encoding='utf-8',
        header=0, index_col=0, largeset=False, nrows=1000):
    '''
    # Introductions:
    Log an overview of a dataset (size, head, labels) and run the na_data and
    feature_type checks on it.

    # Params:
    ds: str/pd.Dataframe, dataset path or dataset
    data_type: str, either 'feature' or 'label'; decides EDA mode
    folder(default None): str, if not None, save EDA files in the folder
    save_graph(default True): boolean, whether the saved files are images
        (.png) or tables (.csv)
    encoding(default 'utf-8'): str, encoding of dataset
    header(default 0): int/list of int, works on pandas.read_csv()
        (learn more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html)
    index_col(default 0): int/list of int, works on pandas.read_csv()
        (learn more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html)
    largeset(default False): boolean, whether to apply low-memory method for
        EDA: the file is scanned once for its size and only the first nrows
        rows are loaded and analysed
    nrows(default 1000): int, works on pandas.read_csv(), work only when
        largeset is True
        (learn more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html)

    # Raises:
    AssertionError: data_type is neither 'feature' nor 'label'
    '''
    types = ['feature', 'label']
    assert data_type in types, 'Types are not valid; should be \'feature\' or \'label\''
    # The start/finish banners name the file, so they only apply to paths.
    from_path = isinstance(ds, str)
    if from_path:
        printlog('---------------------EDA of {}---------------------'.format(
            os.path.basename(ds)))

    if not largeset:
        # Honour the caller's encoding instead of a hard-coded 'utf-8'.
        ds_raw = pd.read_csv(ds, encoding=encoding, index_col=index_col,
                             header=header) if from_path else ds
        ## size, head and label
        printlog('SIZE: [{} sample(row) * {} feature(column)]'.format(
            ds_raw.shape[0], ds_raw.shape[1]))
        printlog('HEAD: \n{}'.format(ds_raw.head()))
        if data_type == 'label':
            printlog('LABELS: {}'.format(list(set(np.ravel(ds_raw.values)))))
        ## na data, feature type
        na_data_path, fe_data_path = None, None
        if folder:
            suffix = '.png' if save_graph else '.csv'
            na_data_path = os.path.join(folder, 'record_na_data' + suffix)
            fe_data_path = os.path.join(folder, 'record_feature_type' + suffix)
        na_data(ds_raw, na_data_path)
        feature_type(ds_raw, fe_data_path)
    else:
        if from_path:
            # One streaming pass for the size; a naive comma split, so quoted
            # commas are miscounted. One line is discounted as the header and
            # one column as the index.
            rows, columns = 0, 0
            with open(ds, encoding=encoding) as file:
                for line in file:
                    rows += 1
                    columns = len(line.split(',')) - 1
            rows -= 1
            printlog('[{} sample(row) * {} feature(column)]'.format(
                rows, columns))
            ds_raw = pd.read_csv(ds, nrows=nrows, encoding=encoding,
                                 header=header, index_col=index_col)
        else:
            ds_raw = ds
        # Analyse the in-memory sample; folder/save_graph are forwarded so
        # that the requested files are still written in large-set mode.
        EDA(ds_raw, data_type, folder=folder, save_graph=save_graph,
            encoding=encoding, header=header, index_col=index_col)

    if from_path:
        printlog('+++++++++++++++++++++EDA of {}+++++++++++++++++++++'.format(
            os.path.basename(ds)))
def date_feature(ds, feature, labels=None, label_column=None, file_path=None,
                 save_graph=True, encoding='utf-8', header=0, index_col=0):
    '''
    # Introductions:
    Count the samples of a datetime feature per (year, month), log the counts
    and optionally save them as a bar chart or a csv file. When both labels
    and label_column are given, the chart is stacked by label.

    # Params:
    ds: str/pd.Dataframe, dataset or dataset path
    feature: str, feature in datetime format
    labels(default None): list, dataset labels
    label_column(default None): str/int, label column in ds or its position
        (position 0 is valid); if both labels and label_column are given,
        date_feature details will be checked by label
    file_path(default None): str, if not None, result of checking is saved at
        the path
    save_graph(default True): boolean, whether save result as graph
        (.png/.jpg/.jpeg) or csv
    encoding(default 'utf-8'): str, encoding of dataset
    header(default 0): int, works on pandas.read_csv(); also passed as
        `header` to to_csv when saving the csv
        (learn more at:https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html)
    index_col(default 0): int, works in pandas.read_csv()
        (learn more at:https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html)

    # Raises:
    AssertionError: feature is not a str or not a column of ds, or file_path
        does not carry a suffix matching save_graph
    '''
    assert isinstance(
        feature, str
    ), 'EDA_massive: feature should be str; {} is entered'.format(feature)
    ds = pd.read_csv(ds, encoding=encoding, header=header,
                     index_col=index_col) if isinstance(ds, str) else ds
    assert feature in ds.columns, 'EDA_massive: feature should be contained in ds columns'

    def count_by_month(frame):
        # Unit-less 'datetime64' is rejected by recent pandas; be explicit.
        dates = frame[feature].astype('datetime64[ns]')
        return dates.groupby([dates.dt.year, dates.dt.month]).count()

    # Explicit checks: a plain truth test would discard label_column == 0 and
    # fail on array-like labels.
    by_label = (labels is not None and len(labels) > 0
                and label_column is not None and label_column != '')
    labels_countby_date = []
    if by_label:
        label_column = ds.columns[label_column] if isinstance(
            label_column, int) else label_column
        labels_countby_date = [
            count_by_month(ds[ds[label_column] == label]) for label in labels
        ]
    date_count = count_by_month(ds)
    printlog('FEATURE {} DATE COUNT: \n{}'.format(feature, date_count))

    if file_path and save_graph:
        assert re.search(
            r'\.(png|jpe?g)$', file_path, re.IGNORECASE
        ), 'EDA.na_data: file_path is not in image format; use .png, .jpg, .jpeg suffix'
        plt.figure(figsize=[10, 10])
        ticks = [str(value) for value in date_count.index.values]
        if by_label:
            # Running top of the stack for every date.
            bottom = np.zeros(date_count.size)
            for label, counts in zip(labels, labels_countby_date):
                # Align to the full date axis; months without this label
                # count as 0 instead of raising on lookup.
                heights = counts.reindex(date_count.index,
                                         fill_value=0).to_numpy()
                plt.bar(ticks, heights, bottom=bottom,
                        label='label: {}'.format(label))
                for position, height in enumerate(heights):
                    if height == 0:
                        continue  # nothing to annotate for this month
                    top = bottom[position] + height
                    plt.plot(position, top, marker='D')
                    plt.text(position - 0.3, top + 1, str(height))
                # Accumulate, so a third label stacks on top of the first two.
                bottom = bottom + heights
        else:
            plt.bar(ticks, date_count.values)
            for position, data in enumerate(date_count.values):
                plt.text(position - 0.3, 2, str(data))
        plt.title('Count on date of feature {}'.format(feature))
        plt.xlabel('Date range')
        plt.xticks(rotation=90)
        plt.ylabel('Sample number')
        plt.legend()
        plt.savefig(file_path)
        plt.close()

    if file_path and not save_graph:
        assert re.search(
            r'\.csv', file_path
        ), 'EDA.na_data: file_path does not match tabular format; use .csv suffix'
        # NOTE(review): `header` is the read_csv header argument reused for
        # to_csv; with the default 0 no header row is written - confirm this
        # is intended.
        date_count.to_csv(file_path, encoding=encoding, header=header)