def _plotScores(scores, paramGrid, to_file, scoreLabel=None, greater_is_better=True, vrange=None, cmap="YlOrRd"):
    keys = sorted(list(paramGrid)[0].keys())
    uniqParams = dict()
    order = dict()
    for k in keys:
        order[k] = np.unique([str(params[k]) for params in list(paramGrid)], return_index=True)[1]
        uniqParams[k] = [params[k] for params in np.asarray(list(paramGrid))[sorted(order[k])]]

    keysToPlot = list()
    for k in keys:
        if len(uniqParams[k]) > 1:
            keysToPlot.append(k)

    for k in keys:
        if k not in keysToPlot:
            uniqParams.pop(k, None)

    numDim = len(keysToPlot)
    if numDim > 3:
        printlog("Too many dimensions to plot.")
    elif numDim == 3:
        _plot3DGrid(scores, uniqParams, keysToPlot, scoreLabel, greater_is_better, vrange, cmap, to_file)
    elif numDim == 2:
        _plot2DGrid(scores, uniqParams, keysToPlot, scoreLabel, greater_is_better, vrange, cmap, to_file)
    elif numDim == 1:
        _plot1DGrid(scores, uniqParams, scoreLabel, vrange, to_file)
    else:
        printlog("No parameters that vary in the grid")
def varyDataset(ds, save_path):
    classed_feature_preffix = [
        [
            '^als_d7_id_', '^als_d15_id_', '^als_m1_id_', '^als_m3_id_',
            '^als_m6_id_', '^als_m12_id_', '^als_fst_id_', '^als_lst_id_'
        ],
        [
            '^als_d7_cell_', '^als_d15_cell_', '^als_m1_cell_', '^als_m3_cell_',
            '^als_m6_cell_', '^als_m12_cell_', '^als_fst_cell_', '^als_lst_cell_'
        ]
    ]
    printlog('class 5 - value padding: larger/smaller')
    ds_t = pd.read_csv(ds, encoding='gb18030', header=0, index_col=0)
    for i, (id_fc, cell_fc) in enumerate(
            zip(
                Preprocess.pattern_to_feature(ds_t,
                                              classed_feature_preffix[0],
                                              encoding='gb18030'),
                Preprocess.pattern_to_feature(ds_t,
                                              classed_feature_preffix[1],
                                              encoding='gb18030'))):
        for id_f, cell_f in zip(id_fc, cell_fc):
            ds_t.insert(loc=ds_t.columns.get_loc(id_f),
                        column=id_f.replace('id', 'large'),
                        value=ds_t[[id_f, cell_f]].apply(np.max, axis=1))
            ds_t.insert(loc=ds_t.columns.get_loc(id_f),
                        column=id_f.replace('id', 'small'),
                        value=ds_t[[id_f, cell_f]].apply(np.min, axis=1))
        printlog('class 5 - value padding finished {} and {}'.format(
            classed_feature_preffix[0][i], classed_feature_preffix[1][i]))
    ds_t.to_csv(save_path, encoding='gb18030')
Example #3
def outlier_data(ds,
                 file_path=None,
                 features=None,
                 measure='std',
                 threshold=3,
                 encoding='utf-8',
                 header=0,
                 index_col=0):
    '''
    # Params:

    ds: str/pd.Dataframe, dataset or dataset path

    file_path(default None): str, if not None, the result is saved in path

    features(default None): list of str/np.array/pd.Series, if not None, only corresponding features will be checked
    '''
    ds = pd.read_csv(ds, encoding=encoding, header=header,
                     index_col=index_col) if isinstance(ds, str) else ds
    series_outlier = pd.Series(dtype=float)
    columns = features if features else ds.columns
    if measure == 'std':
        # count, per checked feature, how many samples lie more than `threshold`
        # standard deviations away from the feature mean
        series_outlier = (np.abs(ds[columns] - ds[columns].mean()) >
                          threshold * ds[columns].std()).sum()
    printlog('OUTLIER:            {}/{} features(threshold: {}, measure: {})'.
             format((series_outlier > 0).values.sum(), series_outlier.size,
                    threshold, measure))
    if file_path:
        series_outlier.to_csv(file_path, encoding=encoding, header=True)
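# Usage sketch (illustrative, not from the original project): the frame and column
# names below are made up; printlog, pd and np are assumed to be imported as in the
# surrounding snippets.
demo_outlier = pd.DataFrame({
    'als_m3_id_nbank_orgnum': [1, 2, 1, 2, 100],  # 100 lies more than 1 std from the mean
    'ir_id_x_cell_cnt': [5, 5, 5, 5, 5],          # constant column, never flagged
})
outlier_data(demo_outlier, measure='std', threshold=1)  # logs "OUTLIER: 1/2 features..."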
Example #4
def drop_sparse(ds,
                features,
                threshold,
                save_path=None,
                encoding='utf-8',
                header=0,
                index_col=0):
    '''
    # Params:

    ds: str/pd.DataFrame, dataset

    threshold: int, minimum number of non-NA samples a feature needs; features whose non-NA count is at or below the threshold are dropped

    '''
    printlog('Preprocess.drop_sparse: started.')
    ds = pd.read_csv(ds, encoding=encoding, header=header,
                     index_col=index_col) if isinstance(ds, str) else ds
    features = ds.columns if features == 'all' else features
    features = [features] if isinstance(features, str) else features
    columns_todrop = ds[features].loc[:, ds[features].notna().sum(
        axis=0) <= threshold]
    ds = ds.drop(columns=columns_todrop.columns)
    if save_path:
        ds.to_csv(save_path, encoding=encoding)
    printlog('Preprocess.drop_sparse: finished. {} features dropped.'.format(
        columns_todrop.shape[1]))
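# Usage sketch (illustrative): a feature is dropped when its non-NA count is at or
# below `threshold`; the frame below is hypothetical and nothing is written to disk.
demo_sparse = pd.DataFrame({
    'mostly_na': [np.nan, np.nan, np.nan, 1.0],  # only 1 non-NA value
    'dense': [1.0, 2.0, 3.0, 4.0],               # fully populated
})
drop_sparse(demo_sparse, features='all', threshold=1)  # logs that 1 feature was dropped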
def generateExperienceFeature(ds):
    printlog(
        '-----------------------------------generate experience feature-----------------------------------'
    )
    ds_temp = pd.read_csv(ds, encoding='gb18030', header=0, index_col=0)
    series_t = pd.Series(ds_temp['cons_tot_m12_visits'], ds_temp.index)
    series_t[series_t.between(-99.001, -0.001)] = -99
    series_t[series_t.between(-0.001, 500.001)] = 500
    series_t[series_t.between(500.001, 1000.001)] = 1000
    series_t[series_t.between(1000.001, 1500.001)] = 1500
    series_t[series_t.between(1500.001, 900000)] = 9000
    ds_temp.loc[:, 'cons_tot_m12_visits'] = series_t

    series_t = pd.Series(data=-1, index=ds_temp.index)
    series_t[(ds_temp['pd_id_gender'] == 0)
             & (ds_temp['pd_id_apply_age'].between(-99.001, 30.001))] = 0
    series_t[(ds_temp['pd_id_gender'] == 0)
             & (ds_temp['pd_id_apply_age'].between(30.001, 60.001))] = 1
    series_t[(ds_temp['pd_id_gender'] == 0)
             & (ds_temp['pd_id_apply_age'].between(60.001, 999.001))] = 2
    series_t[(ds_temp['pd_id_gender'] == 1)
             & (ds_temp['pd_id_apply_age'].between(-0.001, 24.001))] = 3
    series_t[(ds_temp['pd_id_gender'] == 1)
             & (ds_temp['pd_id_apply_age'].between(24.001, 35.001))] = 4
    series_t[(ds_temp['pd_id_gender'] == 1)
             & (ds_temp['pd_id_apply_age'].between(35.001, 45.001))] = 5
    series_t[(ds_temp['pd_id_gender'] == 1)
             & (ds_temp['pd_id_apply_age'].between(45.001, 999.001))] = 2
    if 'pd_gender_age' not in ds_temp.columns:
        ds_temp.insert(ds_temp.columns.size - 1, 'pd_gender_age', series_t)
    else:
        ds_temp.loc[:, 'pd_gender_age'] = series_t
    ds_temp.to_csv(ds, encoding='gb18030')
Example #6
def sparse_feature(ds,
                   features=None,
                   file_path=None,
                   measure='std',
                   threshold=0.01,
                   largeset=False,
                   encoding='utf-8',
                   header=0,
                   index_col=0):
    '''
    # Params: 

    ds: pandas.Dataframe, numpy.ndarray or str of dataset path shaped [n of samples, n of features]

    features(default None): str/list of str, if not None, only corresponding features in ds will be checked

    file_path(default None): str, if not None, result is saved at the path

    measure(default 'std'): str, either 'mean' or 'std', deciding how feature performance is calculated
    (for 'mean', the threshold is compared with the features' absolute means)

    threshold(default 0.01): float, threshold for deciding whether a feature is sparse

    largeset(default False): boolean, whether to apply a low-memory method for sparse detection

    encoding(default 'utf-8'): str, encoding of dataset

    header(default 0): int, works on pandas.read_csv()
    (learn more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html)

    index_col(default 0 ): int, works on pandas.read_csv()
    (learn more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html)

    # Return:
    
    pandas.Series, boolean values of shape [n of features]
    (True means the feature is not sparse; False means it is sparse)

    '''
    assert measure in [
        'mean', 'std'
    ], 'EDA.sparse_feature: parameter measure should be either \'mean\' or \'std\', {} is given'.format(
        measure)
    if type(ds) == str:
        ds = pd.read_csv(ds,
                         encoding=encoding,
                         header=header,
                         index_col=index_col)
    if features is not None:
        # restrict the check to the requested features
        features = [features] if isinstance(features, str) else features
        ds = ds[features]
    if measure == 'mean':
        insparse_feature = ((ds != 0).abs().mean() > threshold)
    elif measure == 'std':
        insparse_feature = (ds != 0).std() > threshold
    printlog(
        'SPARSE:            {}/{} features(threshold: {}, measure: {})'.format(
            insparse_feature.sum(), insparse_feature.size, threshold,
            measure))
    if file_path:
        insparse_feature.to_csv(file_path, encoding=encoding, header=True)
    return insparse_feature
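# Usage sketch (illustrative): with measure='std' the implementation above tests the
# variability of the non-zero indicator (ds != 0), so a column must mix zero and
# non-zero values to be reported as not sparse. The data is made up.
demo_sf = pd.DataFrame({
    'constant_flag': [0, 0, 0, 0],   # all zero -> reported as sparse
    'active_metric': [0, 5, 0, 7],   # mixes zeros and non-zeros -> not sparse
})
not_sparse = sparse_feature(demo_sf, measure='std', threshold=0.01)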
def gridTrainValidSelection(estimator, grid, X_train, y_train, X_valid, y_valid, metric=roc_auc_score, scoreLabel='ROC AUC', to_file=None,
                            showPlot=True, n_jobs=-1, verbose=10, predict_proba=True, greater_is_better=True, vrange=None, cmap=plt.cm.Blues):
    paramGrid = ParameterGrid(grid)
    printlog("-------------FITTING MODELS-------------")
    models = fitModels(estimator, paramGrid, X_train, y_train, n_jobs, verbose)
    printlog("-------------SCORING MODELS-------------")
    scores = scoreModels(models, X_valid, y_valid, metric, predict_proba, n_jobs, verbose)
    if showPlot:
        _plotScores(scores, paramGrid, to_file, scoreLabel, greater_is_better, vrange, cmap)

    return getBestModel(models, scores, greater_is_better), getBestScore(scores, greater_is_better), models, scores
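# Usage sketch on synthetic data (not from the original project): assumes the
# module-level helpers used above (fitModels, scoreModels, getBestModel,
# getBestScore) plus XGBClassifier and roc_auc_score are importable.
from sklearn.model_selection import train_test_split
rng = np.random.RandomState(0)
X_demo = rng.rand(200, 4)
y_demo = (X_demo[:, 0] > 0.5).astype(int)
X_tr, X_va, y_tr, y_va = train_test_split(X_demo, y_demo, test_size=0.3, random_state=0)
xgb_grid = {'max_depth': [2, 3], 'n_estimators': [10, 20]}
best_model, best_score, _, _ = gridTrainValidSelection(
    XGBClassifier(), xgb_grid, X_tr, y_tr, X_va, y_va,
    metric=roc_auc_score, scoreLabel='ROC AUC', showPlot=False)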
Example #8
def shape(ds, largeset=False, encoding='utf-8', header=0, index_col=0):
    '''
    # Params:

    ds: pd.Dataframe or str of dataset path
    
    '''
    if type(ds) == str:
        ds = pd.read_csv(ds, encoding=encoding, header=header,
                         index_col=index_col)
    printlog('SAMPLE(row):           {}'.format(ds.shape[0]))
    printlog('FEATURE/LABEL(column): {}'.format(ds.shape[1]))
Example #9
def na_data(ds,
            file_path=None,
            save_graph=True,
            features=None,
            encoding='utf-8',
            header=0,
            index_col=0):
    '''
    # Params:

    ds: str/pd.Dataframe, dataset or dataset path

    file_path(default None): str, if not None, result of checking is saved at the path

    save_graph(default True): boolean, whether to save the result as a graph (True) or as a csv (False)

    features(default None): list of str/np.array/pd.Series, if not None, only the corresponding features will be checked

    encoding(default 'utf-8'): str, encoding of dataset

    header(default 0): int, works on pandas.read_csv()
    (learn more at:https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html)

    index_col(default 0): int, works in pandas.read_csv()
    (learn more at:https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html)

    '''
    ds = pd.read_csv(ds, encoding=encoding, header=header,
                     index_col=index_col) if isinstance(ds, str) else ds
    columns = features if features else ds.columns
    # count NA entries per checked feature, indexed by feature name
    series_na = ds[columns].isna().sum()
    printlog('NA:            {}/{} features(totally {} data)'.format(
        (series_na > 0).sum(), series_na.size, series_na.sum()))
    if file_path and save_graph:
        assert re.search(r'\.png$', file_path) or re.search(
            r'\.jpg$', file_path
        ) or re.search(
            r'\.jpeg$', file_path
        ), 'EDA.na_data: file_path is not in image format; use .png, .jpg, .jpeg suffix'
        sns.distplot(series_na, kde=False)
        plt.title('Na data in features')
        plt.xlabel('Na data count')
        plt.ylabel('feature count')
        plt.savefig(file_path)
        plt.close()
    if file_path and not save_graph:
        assert re.search(
            r'\.csv$', file_path
        ), 'EDA.na_data: file_path does not match tabular format; use .csv suffix'
        series_na.to_csv(file_path, encoding=encoding, header=True)
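# Usage sketch (illustrative): count NA entries per feature; with file_path=None only
# the summary line is logged, nothing is plotted or saved. The data is made up.
demo_na = pd.DataFrame({
    'als_m1_id_bank_allnum': [1.0, np.nan, 3.0, np.nan],  # 2 NA values
    'pd_id_apply_age': [25, 40, 33, 57],                  # no NA values
})
na_data(demo_na)  # logs "NA: 1/2 features(totally 2 data)"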
Example #10
def feature_padding_on_hit_rate(ds,
                                features,
                                preffix_patterns,
                                encoding='utf-8',
                                header=0,
                                index_col=0):
    ## get the features matching each prefix pattern
    classed_class_features = Preprocess.pattern_to_feature(ds,
                                                           preffix_patterns,
                                                           encoding=encoding)
    ds = pd.read_csv(
        ds, encoding=encoding, header=header,
        index_col=index_col) if isinstance(ds, str) else ds
    ## tmp: per-class feature suffixes, not yet flattened
    tmp = [
        list(map(lambda fc, pf=preffix: fc[len(pf) - 1:],
                 feature_class)) for preffix, feature_class in zip(
                     preffix_patterns, classed_class_features)
    ]
    class_suffix = []
    for t in tmp:
        class_suffix.extend(t)
    ## class_suffix: unique suffixes, flattened across classes
    class_suffix = list(set(class_suffix))
    # print('feature_padding: preffix_patterns = {}'.format(preffix_patterns))
    ## for each suffix, keep the feature (across prefixes) with the highest hit rate
    mut_exc_feature = []
    for suffix in class_suffix:
        tmp_hit_rate = 0
        tmp_output_feature = ''
        for i, t in enumerate(tmp):
            if suffix in t:
                tmp_feature = preffix_patterns[i][1:] + suffix
                tmp_feature_hit_rate = ds[tmp_feature].notna().sum(
                ) / ds.shape[0]
                if tmp_feature_hit_rate > tmp_hit_rate:
                    tmp_hit_rate = tmp_feature_hit_rate
                    tmp_output_feature = tmp_feature
        if tmp_output_feature != '':
            mut_exc_feature.append(tmp_output_feature)
    printlog('feature_padding_on_hit_rate: mut_exc_feature: {}'.format(
        mut_exc_feature),
             printable=False)
    # if suffix in tmp[0]:
    #     mut_exc_feature.append(preffix_patterns[0][1:] + suffix)
    # elif suffix not in tmp[0]:
    #     mut_exc_feature.append(preffix_patterns[1][1:] + suffix if suffix in tmp[1] else preffix_patterns[2][1:] + suffix)
    return mut_exc_feature
Example #11
def dull_feature(ds,
                 threshold,
                 label_column,
                 encoding='utf-8',
                 header=0,
                 index_col=0):
    '''
    # Params:

    ds: str/pd.Dataframe, dataset or dataset path

    threshold: float, a feature is flagged when the proportion of its most frequent (value, label) pair exceeds this threshold

    encoding(default 'utf-8'): str, encoding of dataset

    header(default 0): int, works on pandas.read_csv()
    (learn more at:https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html)

    index_col(default 0): int, works in pandas.read_csv()
    (learn more at:https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html)

    # Instructions:

    Flag features in which a single (value, label) pair accounts for an oversized share of the samples.

    '''
    ds = pd.read_csv(ds, encoding=encoding, header=header,
                     index_col=index_col) if isinstance(ds, str) else ds
    label_column = ds.columns[label_column] if isinstance(
        label_column, int) else label_column
    # if isinstance(label_column, int):
    #     printlog('dull samples: {}/{} features contain more than {} dull samples'.format(
    #         (ds.iloc[:, ds.columns != ds.columns[label_column]]
    #         .apply(lambda column: column.astype(str) + '_' + ds.iloc[:, label_column].astype(str))
    #         .apply(lambda column: column.value_counts().max() / column.value_counts().sum())
    #         > threshold).sum(),
    #         ds.columns.size - 1,
    #         threshold
    #     ))
    printlog(
        'dull samples: {}/{} features have a single (value, label) pair exceeding proportion {}'.format(
            (ds.iloc[:, ds.columns != label_column].apply(
                lambda column: column.astype(str) + '_' +
                ds.loc[:, label_column].astype(str)).apply(
                    lambda column: column.value_counts().max() /
                    column.value_counts().sum()) > threshold).sum(),
            ds.columns.size - 1, threshold))
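# Usage sketch (illustrative): a feature is flagged when a single (value, label) pair
# exceeds the given proportion of samples; the frame below is made up.
demo_dull = pd.DataFrame({
    'dull_feature': [1, 1, 1, 1, 2],    # the pair (1, 0) covers 4/5 = 0.8 of the samples
    'varied_feature': [1, 2, 3, 4, 5],  # every (value, label) pair is unique
    'label': [0, 0, 0, 0, 1],
})
dull_feature(demo_dull, threshold=0.7, label_column='label')  # flags 1/2 features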
def gridCVSelection(estimator, estimator_name, save_folder, train_features, train_label,
                    valid_features, valid_label, grid_params, grid_scorers, refit_scorer, n_jobs=-1):
    """
    # Example: 
    ```
    xgb = XGBClassifier()
    train_dataset = pd.read_csv(ds_train, header=0, index_col=0)
    valid_dataset = pd.read_csv(ds_valid, header=0, index_col=0)
    xgb_params          = {'max_depth': [3, 4, 5], 'n_estimators': range(10, 301, 10)}
    xgb_scorer          = ['neg_mean_squared_error', 'roc_auc']
    Assess.gridCVSelection(xgb, 'xgb', 'misc', 
        train_dataset.loc[:, selected_features], train_dataset.iloc[:,-1], 
        valid_dataset.loc[:, selected_features], valid_dataset.iloc[:,-1], 
        xgb_params, xgb_scorer, refit_scorer='roc_auc')
    ```
    """
    printlog('Assess.gridCVSelection: {} started.'.format(estimator_name))
    grid = GridSearchCV(estimator, grid_params, grid_scorers, refit=refit_scorer, n_jobs=n_jobs)
    grid.fit(X=train_features, y=train_label)
    train_CV_result = grid.cv_results_
    grid.fit(X=valid_features, y=valid_label)
    valid_CV_result = grid.cv_results_
    if estimator_name and save_folder: 
        _plotGridCVResult(estimator_name, save_folder, grid_params, grid_scorers, train_CV_result, valid_CV_result)
    bias               = train_CV_result['mean_test_{}'.format(refit_scorer)] - valid_CV_result['mean_test_{}'.format(refit_scorer)]
    variance           = np.power(valid_CV_result['std_test_{}'.format(refit_scorer)], 2)
    error              = bias + variance
    expected_CV_result = train_CV_result['mean_test_{}'.format(refit_scorer)] + error
    printlog('Assess.gridCVSelection: optimal params: {}'.format(train_CV_result['params'][np.argmax(expected_CV_result)]))
    printlog('Assess.gridCVSelection: {} finished.'.format(estimator_name))
    return train_CV_result['params'][np.argmax(expected_CV_result)]
Example #13
def feature_type(ds,
                 file_path=None,
                 save_graph=True,
                 encoding='utf-8',
                 header=0,
                 index_col=0):
    '''
    # Params:

    ds: str/pd.DataFrame, dataset

    # Instructions:

    Check dataset feature types

    '''
    ds = pd.read_csv(ds, encoding=encoding, header=header,
                     index_col=index_col) if isinstance(ds, str) else ds
    type_count = ds.dtypes.value_counts()
    printlog('FEATURE TYPECOUNT: \n{}'.format(type_count))
    if file_path and save_graph:
        assert re.search(r'\.png$', file_path) or re.search(
            r'\.jpg$', file_path
        ) or re.search(
            r'\.jpeg$', file_path
        ), 'EDA.feature_type: file_path is not in image format; use .png, .jpg, .jpeg suffix'
        # printlog([(str)(value) for value in type_count.index.values])
        # printlog(type_count.values)
        plt.bar([str(value) for value in type_count.index.values],
                type_count.values)
        plt.title('Feature type')
        plt.xlabel('dtype')
        plt.ylabel('Feature count')
        plt.savefig(file_path)
        plt.close()
    if file_path and not save_graph:
        assert re.search(
            r'\.csv$', file_path
        ), 'EDA.feature_type: file_path does not match tabular format; use .csv suffix'
        type_count.to_csv(file_path, encoding=encoding, header=True)
Example #14
def poor_sample(ds, threshold, encoding='utf-8', header=0, index_col=0):
    '''
    # Params:

    ds: str/pd.Dataframe, dataset or dataset path

    threshold: int, samples are checked by threshold number of notNa features

    encoding(default 'utf-8'): str, encoding of dataset

    header(default 0): int, works on pandas.read_csv()
    (learn more at:https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html)

    index_col(default 0): int, works in pandas.read_csv()
    (learn more at:https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html)

    '''
    ds = pd.read_csv(ds, encoding=encoding, header=header,
                     index_col=index_col) if isinstance(ds, str) else ds
    printlog('poor samples: {}/{} samples contain no more than {} notNa features'.
             format((ds.notna().sum(axis=1) <= threshold).sum(), ds.index.size,
                    threshold))
Example #15
def feature_EDA(ds, features, label_column=None, encoding='utf-8', printable=True, header=0, index_col=0):
    '''
    # Params:

    ds: str/pd.DataFrame, dataset

    features: (list of )feature str

    '''
    assert isinstance(features, (str, list, np.ndarray, pd.Series)), 'EDA.feature_EDA: features should be str, list, np.ndarray or pd.Series; {} was given'.format(type(features))
    ds = pd.read_csv(ds, encoding=encoding, header=header, index_col=index_col) if isinstance(ds, str) else ds
    features = [features] if isinstance(features, str) else features
    if label_column:
        label_column = ds.columns[label_column] if isinstance(label_column, int) else label_column
    for feature in features:
        printlog('feature {} has values {} of dtypes {}, distribution {}, label distribution {}'.format(
            feature, 
            list(set(np.ravel(ds[ds[feature].notna()][feature].values))),
            list(set(np.ravel(ds[feature].values.dtype))),
            list(ds[ds[feature].notna()][feature].value_counts().values),
            list(ds[ds[feature].notna()][label_column].value_counts().values) if label_column else '(label_column not given)'
        ), printable=printable)
Example #16
def feature_na(ds, features, encoding='utf-8', header=0, index_col=0):
    '''
    # Params:

    ds: str/pd.DataFrame, dataset

    features: (list of )feature str

    # Instructions:

    Show na data numbers in features.

    '''
    assert isinstance(features, (str, list, np.ndarray, pd.Series)), 'EDA.feature_na: unexpected features type: {}'.format(type(features))
    ds = pd.read_csv(ds, encoding=encoding, header=header, index_col=index_col) if isinstance(ds, str) else ds
    features = [features] if isinstance(features, str) else features
    for feature in features:
        printlog('feature {} has {}/{} na sample(s)'.format(
            feature, 
            ds[feature].isnull().sum(), 
            ds.index.size
        ))
def optimalCutoff(estimator, features, labels):
    assessment = []
    proba = estimator.predict_proba(features)[:, 0]  # probability of class 0
    printlog('dataset size: {}'.format(labels.shape))
    printlog('dataset label 0: {}'.format((labels == 0).sum()))
    for cutoff in [i / 1000 for i in range(1, 1001)]:
        # threshold a fresh copy each iteration so earlier cutoffs do not
        # overwrite the predicted probabilities
        pred = (proba > cutoff).astype(int)
        true_neg = ((pred == 1) & (labels == 0)).sum()
        false_neg = ((pred == 1) & (labels == 1)).sum()
        assessment.append(true_neg * 0.1 - false_neg * 0.4)
    printlog('optimalCutoff: {}'.format(
        (np.array(assessment).argmax() + 1) / 1000))
    printlog('optimalCutoff target function: {}'.format(
        max(assessment)))
    return (np.array(assessment).argmax() + 1) / 1000
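# Usage sketch (hypothetical data): the objective above, 0.1 * true_negatives -
# 0.4 * false_negatives, means a missed positive costs four times what a correct
# rejection earns; any fitted binary classifier with predict_proba works here.
from sklearn.linear_model import LogisticRegression
X_co = np.random.RandomState(0).rand(300, 3)
y_co = (X_co[:, 0] > 0.5).astype(int)
clf = LogisticRegression().fit(X_co, y_co)
best_cutoff = optimalCutoff(clf, X_co, y_co)  # threshold on the class-0 probability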
Example #18
def select_feature_iv(ds,
                      features,
                      label_column,
                      strict_upper_bound,
                      strict_lower_bound,
                      to_file=None,
                      encoding='utf-8',
                      header=0,
                      index_col=0,
                      informative=True):
    printlog('Temp_support.select_feature_iv: started.', printable=informative)
    assert strict_upper_bound > strict_lower_bound, 'Temp_support.select_feature_iv: strict_upper_bound should be larger than strict_lower_bound'
    ds = pd.read_csv(ds, encoding=encoding, header=header,
                     index_col=index_col) if isinstance(ds, str) else ds
    features = [features] if isinstance(features, (str, int)) else features
    features = [ds.columns[f] if isinstance(f, int) else f for f in features]
    printlog('Temp_support.select_feature_iv: calculating feature iv...')
    features_iv = feature_iv(ds,
                             features,
                             label_column,
                             encoding=encoding,
                             header=header,
                             index_col=index_col)
    if to_file:
        printlog('Temp_support.select_feature_iv: saving to path {}...'.format(
            to_file))
        pd.DataFrame(features_iv, index=features,
                     columns=['iv']).to_csv(to_file, encoding=encoding)
    printlog('Temp_support.select_feature_iv: temporary feature iv: {}'.format(
        [(feature, iv) for feature, iv in zip(features, features_iv)]),
             printable=False)
    printlog('Temp_support.select_feature_iv: finished.',
             printable=informative)
    return [
        feature for feature, iv in zip(features, features_iv)
        if iv > strict_lower_bound and iv < strict_upper_bound
    ]
Example #19
def fill_na(ds,
            features,
            replacement=-99,
            flag_feature=None,
            flag_replacement=None,
            save_path=None,
            encoding='utf-8',
            header=0,
            index_col=0,
            informative=True):
    '''
    # Params:

    ds: str/pd.DataFrame, dataset

    features: (list of )str, checked features

    replacement(default -99): int, replacement for na data

    flag_feature(default None): str, possible flag feature for alternative replacement

    flag_replacement(default None): int, alternative replacement for possible flag feature

    # Returns:

    pd.DataFrame of ds

    # Instructions:

    Replace na data of given features, replace with flag_replacement if flag_feature is 1.

    '''
    printlog('Preprocess.fill_na: started.', printable=informative)
    ds = pd.read_csv(ds, encoding=encoding, header=header,
                     index_col=index_col) if isinstance(ds, str) else ds
    features = ds.columns if features == 'all' else features
    features = [features] if isinstance(features, str) else features
    for feature in features:
        if flag_feature is not None and flag_replacement is not None:
            printlog('fill na: feature: {}; \tflag_feature: {}'.format(
                feature, flag_feature),
                     printable=False)
            flag_feature = ds.columns[flag_feature] if isinstance(
                flag_feature, int) else flag_feature
            ds.loc[(ds[feature].isna()) & (ds[flag_feature] == 1),
                   feature] = flag_replacement
            ds.loc[(ds[feature].isna()) & (ds[flag_feature] == 0),
                   feature] = replacement
        else:
            ds.loc[(ds[feature].isna()), feature] = replacement
        if ds[feature].isna().any():
            raise Exception('still na')
        if ds[feature].isna().any():
            raise Exception('still na')
    if save_path:
        ds.to_csv(save_path, encoding=encoding)
    printlog('Preprocess.fill_na: finished.', printable=informative)
    return ds
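# Usage sketch (illustrative): plain NA filling vs. flag-dependent filling; the frame
# and the flag column below are made up.
demo_fill = pd.DataFrame({
    'als_m6_id_rel_allnum': [np.nan, 2.0, np.nan, 4.0],
    'flag_hit': [1, 0, 0, 1],  # made-up indicator column
})
# NA rows with flag_hit == 1 receive -1, NA rows with flag_hit == 0 receive -99
demo_fill = fill_na(demo_fill, 'als_m6_id_rel_allnum', replacement=-99,
                    flag_feature='flag_hit', flag_replacement=-1)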
Example #20
def split_measure(label_train,
                  label_test,
                  labels,
                  encoding='utf-8',
                  header=0,
                  index_col=0):
    '''
    # Params:

    label_train: str/pd.DataFrame, trainset label

    label_test: str/pd.DataFrame, testset label

    # Instructions: 

    Check the distribution of labels between trainset and testset.

    '''
    assert type(label_train) in [
        str, pd.DataFrame
    ], 'Preprocess.split_measure: input should be str or pd.Dataframe'
    assert type(label_test) in [
        str, pd.DataFrame
    ], 'Preprocess.split_measure: input should be str or pd.Dataframe'
    if type(label_train) == str:
        label_train = pd.read_csv(label_train,
                                  encoding=encoding,
                                  header=header,
                                  index_col=index_col)
    if type(label_test) == str:
        label_test = pd.read_csv(label_test,
                                 encoding=encoding,
                                 header=header,
                                 index_col=index_col)
    train_size = label_train.shape[0]
    test_size = label_test.shape[0]
    printlog('Trainset: ')
    for label in labels:
        printlog('\tlabel {}: {}'.format(
            label, round((label_train == label).values.sum() / train_size, 3)))
    printlog('Testset: ')
    for label in labels:
        printlog('\tlabel {}: {}'.format(
            label, round((label_test == label).values.sum() / test_size, 3)))
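# Usage sketch (illustrative): compare the label distribution between a train and a
# test split; the label frames below are made up.
demo_lb_train = pd.DataFrame({'label': [0, 0, 0, 1]})
demo_lb_test = pd.DataFrame({'label': [0, 0, 1, 1]})
split_measure(demo_lb_train, demo_lb_test, labels=[0, 1])  # logs 0.75/0.25 vs 0.5/0.5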
Example #21
def cut(ds,
        features,
        threshold=10,
        bin=None,
        method='equal-distance',
        label_column=None,
        save_path=None,
        encoding='utf-8',
        header=0,
        index_col=0,
        informative=True):
    printlog('Temp_support.cut: started.', printable=informative)
    ds = pd.read_csv(ds, encoding=encoding, header=header,
                     index_col=index_col) if isinstance(ds, str) else ds
    features = [features] if isinstance(features, (str, int)) else features
    features = [ds.columns[f] if isinstance(f, int) else f for f in features]
    assert not ds.loc[:, features].isna().values.any(
    ), 'Temp_support.cut: ds should not contain na data'
    # features = [feature for feature in features if len(list(set(np.ravel(ds[[feature]].values)))) > threshold]
    features = [
        feature for feature in features
        if ds[feature].unique().size > threshold
    ]
    for feature in features:
        printlog('Temp_support.cut: cutting {}'.format(feature),
                 printable=False)
        if method == 'equal-distance':
            assert bin, 'Temp_support.cut: bin should be input'
            ds.loc[:, feature] = pd.cut(ds[feature], bin)
        elif method == 'equal-frequency':
            assert bin, 'Temp_support.cut: bin should be input'
            ds.loc[:, feature] = pd.qcut(ds[feature], bin, duplicates='drop')
        elif method == 'optimal':
            assert label_column, 'Temp_support: optimal cut should give label column'
            label_column = ds.columns[label_column] if isinstance(
                label_column, int) else label_column
            max_depth = (int)(np.log2(bin)) + 1 if bin else 4
            min_leaf = (int)(ds[feature].unique().size / (2**max_depth)) + 1
            ds.loc[:, feature] = optimal_cut(ds[feature], ds[label_column],
                                             max_depth, min_leaf)
            # (int)(np.log2(ds[feature].unique().size)) - 2))
    if save_path:
        ds.to_csv(save_path, encoding=encoding)
    printlog('Temp_support.cut: finished.', printable=informative)
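# Usage sketch (illustrative): bin a continuous feature in place; threshold=10 means
# only columns with more than 10 distinct values are binned, and 'equal-frequency'
# puts roughly the same number of samples in each of the `bin` buckets.
demo_cut = pd.DataFrame({'als_fst_id_nbank_inteday': np.arange(50, dtype=float)})
cut(demo_cut, 'als_fst_id_nbank_inteday', threshold=10, bin=5, method='equal-frequency')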
def tree_classifier(ds,
                    features,
                    label_column,
                    max_depth=None,
                    export_path=None,
                    fill_na=None,
                    fill_cat=None,
                    encoding='utf-8',
                    header=0,
                    index_col=0,
                    informative=True):
    printlog('Model.tree_classifier: started.', printable=informative)
    ds = pd.read_csv(ds, encoding=encoding, header=header,
                     index_col=index_col) if isinstance(ds, str) else ds
    features = [features] if isinstance(features, str) else features
    label_column = ds.columns[label_column] if isinstance(
        label_column, int) else label_column
    assert fill_na or ds.loc[:, features].isna().sum().sum(
    ) == 0, 'Model.tree_classifier: features contains na data; fill_na must be given'
    assert fill_cat or np.dtype('O') not in list(
        map(lambda column: ds[column].dtype, features)
    ), 'Model.tree_classifier: features contains categorical data; fill_cat must be given'
    if fill_na:
        ds = fill_na(ds, features)
    if fill_cat:
        ds, encoder, features = fill_cat(ds, features)
    # print('features after fill_cat: {}'.format(features))
    clt = tree.DecisionTreeClassifier(max_depth=max_depth)
    # print('ds[label_column]: {}'.format(ds.loc[:, label_column].head()))
    # print(ds.head())
    clt = clt.fit(ds.loc[:, features], ds.loc[:, label_column])
    if export_path:
        assert re.search(
            '.dot', export_path
        ), 'Model.tree_classifier: export_path should be in dot format'
        tree.export_graphviz(clt, export_path, feature_names=features)
    else:
        printlog(tree.export_graphviz(clt))
    printlog('Model.tree_classifier: finished.', printable=informative)
    if not fill_cat:
        return clt
    elif fill_cat:
        return clt, encoder, features
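# Usage sketch (illustrative): fit a shallow decision tree on clean numeric features;
# the data is made up, and with export_path=None the graphviz description is only logged.
demo_tree = pd.DataFrame({
    'sl_id_bank_lost': [0, 0, 1, 1, 0, 1],
    'fr_id_x_cell_cnt': [1, 2, 8, 9, 1, 7],
    'label': [0, 0, 1, 1, 0, 1],
})
gate_clt = tree_classifier(demo_tree, ['sl_id_bank_lost', 'fr_id_x_cell_cnt'],
                           label_column='label', max_depth=2)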
Example #23
def fill_cat(ds,
             features,
             method='label_encoder',
             save_path=None,
             encoding='utf-8',
             header=0,
             index_col=0,
             informative=True):
    '''
    # Instructions:

    Automatically check given features and encode categorical columns in pd.DataFrame into numerical-encoded or one-hot-encoded columns.

    Learn more at: label_encoder https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
     and label_binarizer https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelBinarizer.html
    
    For label binarizer, columns with categorical data are removed and new columns are generated by \'[old feature name]_[categorical value]\'

    # Params:

    ds: str/pd.DataFrame, (path of )dataset

    features: (list of )features

    method(default 'label_encoder'): 'label_encoder' or 'label_binarizer'

    save_path(default None): str, path of encoded dataset, optional for label encoder while required for label binarizer

    # Returns: 

    Encoded dataset in pd.DataFrame(the newly-generated columns are not inserted in the middle for label binarizer)

    sklearn.preprocessing.LabelEncoder()/sklearn.preprocessing.LabelBinarizer(), with attribute encoder.classes_ as the sequence of the encoding

    List of new features(the newly-generated features are not inserted in the middle for label binarizer)

    '''
    printlog('Preprocess.fill_cat: started.', printable=informative)
    ds = pd.read_csv(ds, encoding=encoding, header=header,
                     index_col=index_col) if isinstance(ds, str) else ds
    features = ds.columns if features == 'all' else features
    features = [features] if isinstance(features, str) else features
    if method == 'label_encoder':
        encoder = sklearn.preprocessing.LabelEncoder()
        # only the object-dtype (categorical) columns among the requested features
        categorical_features = ds[features].loc[:, ds.dtypes[features] ==
                                                np.dtype('O')].columns
        printlog('Preprocess.fill_cat: categorical features: {}'.format(
            categorical_features),
                 printable=False)
        ds.loc[:, categorical_features] = ds[categorical_features].apply(
            lambda column: encoder.fit_transform(column.astype(str)))
        # printlog(ds.dtypes[features] == np.dtype('O'))
        # printlog(ds[features].loc[:, ds.dtypes[features] == np.dtype('O')])
        # ds[features].loc[:, ds.dtypes[features] == np.dtype('O')].apply(lambda column: encoder.fit_transform(column.values.ravel()), axis=0)
        # ds.loc[:, features] = ds[features].mask(ds.dtypes[features] == np.dtype('O'),
        #     lambda target_df: target_df.apply(lambda column: encoder.fit_transform(column.astype(str))))
        # for feature in features:
        #     if ds[feature].dtype == np.dtype('O'):
        #         encoder.fit(list(set(np.ravel(ds[feature].astype(np.dtype('O')).values))))
        #         # encoder.classes_ = list(set(np.ravel(ds[feature].astype(np.dtype(str)).values)))
        #         # print(encoder.classes_)
        #         ds.loc[:, feature] = encoder.transform(ds[feature].astype(np.dtype(str)))
        #         # print(list(set(np.ravel(ds[feature].values))))
        #         if ds[feature].dtype == np.dtype('O'):
        #             raise Exception('still categorical')
    elif method == 'label_binarizer':
        assert save_path, 'Preprocess.fill_cat: method \'label_binarizer\' split categorical feature into one-hot features, therefore new ds must be saved'
        encoder = sklearn.preprocessing.LabelBinarizer()
        for feature in features:
            if ds[feature].dtype == np.dtype('O'):
                feature_suffix = list(
                    set(np.ravel(ds[feature].astype(str).values)))
                tmp_new_feature = [
                    feature + '_' + suffix for suffix in feature_suffix
                ]
                features.extend(tmp_new_feature)
                encoder.fit(feature_suffix)
                # print(encoder.classes_)
                tmp_new_ds = pd.DataFrame(encoder.transform(ds[feature].astype(
                    np.dtype(str))),
                                          columns=tmp_new_feature,
                                          index=ds.index)
                ds = pd.concat([tmp_new_ds, ds], axis=1)
                del ds[feature]
                features.remove(feature)
                if feature in ds.columns:
                    raise Exception('still categorical')
    if save_path:
        ds.to_csv(save_path, encoding=encoding)
    # print(ds.loc[:, features].head())
    printlog('Preprocess.fill_cat: finished.', printable=informative)
    return ds, encoder, features
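# Usage sketch (illustrative): numerically encode the object-dtype columns; with the
# default method='label_encoder' the columns are replaced in place and no save path is
# required. The frame is made up.
demo_cat = pd.DataFrame({
    'pd_id_province': ['gd', 'bj', 'gd', 'sh'],  # object dtype -> integer codes
    'pd_id_apply_age': [25, 40, 33, 57],         # numeric -> left untouched
})
demo_cat, cat_encoder, cat_features = fill_cat(demo_cat, 'all')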
Example #24
def two_layer_tree(ds,
                   feature_1,
                   feature_2,
                   label_column,
                   to_file=None,
                   printable=False,
                   encoding='utf-8',
                   header=0,
                   index_col=0):
    ds = pd.read_csv(ds, encoding=encoding, header=header,
                     index_col=index_col) if isinstance(ds, str) else ds
    feature_1 = ds.columns[feature_1] if isinstance(feature_1,
                                                    int) else feature_1
    feature_2 = ds.columns[feature_2] if isinstance(feature_2,
                                                    int) else feature_2
    label_column = ds.columns[label_column] if isinstance(
        label_column, int) else label_column
    value_1 = list(set(np.ravel(ds[ds[feature_1].notna()][feature_1].values)))
    value_2 = list(set(np.ravel(ds[ds[feature_2].notna()][feature_2].values)))
    label = list(
        set(np.ravel(ds[ds[label_column].notna()][label_column].values)))
    if printable:
        for v1 in value_1:
            for v2 in value_2:
                printlog(
                    'value - {}: {}, {}: {}; dist - {}: {}, {}: {}'.format(
                        feature_1, v1, feature_2, v2, label[0],
                        ds[(ds[feature_1] == v1) & (ds[feature_2] == v2) &
                           (ds[label_column] == label[0])].shape[0], label[1],
                        ds[(ds[feature_1] == v1) & (ds[feature_2] == v2) &
                           (ds[label_column] == label[1])].shape[0]))
    if to_file:
        with open(to_file, 'w+') as file:
            file.write('''digraph Tree {}
node [shape=box] ;
0 [label=\"{}\"] ;
'''.format('{', feature_1))
            for i, v1 in enumerate(value_1):
                file.write('0 -> {} ;\n'.format(i + 1))
                file.write('{} [label=\"{}\\n{}: {}\\n{}: {}\"] ;\n'.format(
                    i + 1, v1, label[0],
                    ds[(ds[feature_1] == v1)
                       & (ds[label_column] == label[0])].shape[0], label[1],
                    ds[(ds[feature_1] == v1)
                       & (ds[label_column] == label[1])].shape[0]))
                for j, v2 in enumerate(value_2):
                    file.write(
                        '{} [label=\"{}\\n{}: {}\\n{}: {}\"] ;\n'.format(
                            len(value_1) + len(value_2) * i + j + 1, v2,
                            label[0],
                            ds[(ds[feature_1] == v1) & (ds[feature_2] == v2) &
                               (ds[label_column] == label[0])].shape[0],
                            label[1],
                            ds[(ds[feature_1] == v1) & (ds[feature_2] == v2) &
                               (ds[label_column] == label[1])].shape[0]))
                    file.write('{} -> {} ;\n'.format(
                        i + 1,
                        len(value_1) + len(value_2) * i + j + 1
                        # round(2 / math.cos(60 - 120 / (len(value_2) - 1) * j), 3),
                        # 60 - 120 / (len(value_2) - 1) * j
                    ))
            file.write('{} [label=\"layer 1: {}\\nlayer 2: {}\"] ;\n'.format(
                len(value_1) * len(value_2) + len(value_1) + 1, feature_1,
                feature_2))
            file.write('}')
def run():
    printlog(
        '-----------------------------------start presetting-----------------------------------'
    )
    ## hyperparams
    ## feature selection
    drop_sparse_threshold = 10
    hit_pos_rate_upper = 0.5
    hit_pos_rate_lower = 0.2
    tree_max_depth = None
    iv_upper_thresh = 999
    iv_lower_thresh = 0.2
    lasso_alpha = 1.0
    lasso_coef = 1e-05
    ## model
    xgb_FP_grad_mul = 0.3
    xgb_FN_grad_mul = 1.2
    xgb_zero_proba_cutoff = 0.5
    ## settings
    matplotlib.use('Agg')
    plt.rcParams['axes.unicode_minus'] = False
    plt.rcParams['font.family'] = 'SimHei'
    Log.clear_log(creative=True)
    ##
    ds_path = 'data/data.csv'  # raw dataset
    ds_merged = 'data/ds_merged.csv'  # raw dataset merged with population dataset
    ds_ns = 'tmp/ds_ns.csv'  # merged dataset clear of sparse columns
    ds_na = 'tmp/ds_na.csv'  # merged dataset clear of na data
    ds_cat = 'tmp/ds_cat.csv'  # merged dataset clear of categorical feature
    ds_cut = 'tmp/ds_cut.csv'  # merged dataset cut for IV feature selection
    ds_varied = 'tmp/ds_varied.csv'  # merged dataset varied
    ds_train = 'tmp/ds_train.csv'  # split train dataset
    ds_valid = 'tmp/ds_valid.csv'  # split validation dataset
    ds_test = 'tmp/ds_test.csv'  # split test dataset
    iv_detail = 'iv/iv_detail.csv'  # dataset with feature IVs
    lasso_detail = 'lasso/lasso_detail.csv'  # dataset with feature lasso coefficients
    xgb_detail = 'xgb/xgb_detail.csv'  # dataset with feature xgb importances
    fe_iv = 'features/fe_iv.csv'  # selected feature by IV
    fe_lasso = 'features/fe_lasso.csv'  # selected feature by lasso coefficients
    fe_xgb = 'features/fe_xgb.csv'  # selected feature by xgb importances
    tree_gate = 'tmp/tree_gate.joblib'  # trained tree model
    model_xgb = 'xgb/model_xgb.joblib'  # trained xgb model
    model_xgb_optim = 'xgb/model_xgb_optim.joblib'  # trained xgb model optimized
    model_stacking = 'tmp/model_stacking.joblib'  # trained stacking model
    plot_gate_tree = 'tmp/gate_tree.dot'  # plot of tree model
    fe_gate_hit = 'features/fe_gate_hit.csv'  # selected gate feature
    fe_gate_tree = 'features/fe_gate_tree.csv'  # selected tree feature
    cutoff_xgb = 'tmp/cutoff.txt'
    cutoff_xgb_optim = 'tmp/cutoff_optim.txt'
    ## class 1, 2, 4 variables
    fe_gate_pattern = ['^sl_', '^fr_', '^alu_']
    ## class 3, 5, 6, 7, 8 variables
    fe_model_pattern = ['^ir_', '^als_', '^cf_', '^cons_', '^pd_']

    # printlog('-----------------------------------feature preprocess-----------------------------------')
    # printlog('-----------------------------------prepare dataset-----------------------------------')
    # Preprocess.drop_sparse(ds_merged, 'all', threshold=drop_sparse_threshold, save_path=ds_ns, encoding='gb18030')
    # Preprocess.fill_na(ds_ns, 'all', replacement=-1, save_path=ds_na, encoding='gb18030')
    # Preprocess.fill_cat(ds_na, 'all', save_path=ds_cat, encoding='gb18030')
    # varyDataset(ds=ds_cat, save_path=ds_varied)
    # generateExperienceFeature(ds_varied)
    # train_fe, valid_fe, test_fe, train_lb, valid_lb, test_lb = Preprocess.train_validation_test_split(ds_varied, -1, 0.8, 0.05, 0.15, encoding='gb18030')
    # printlog('train label proportion:      {}; '.format(train_lb.sum() / train_lb.count()))
    # printlog('validation label proportion: {}; '.format(valid_lb.sum() / valid_lb.count()))
    # printlog('test label proportion:       {}; '.format(test_lb.sum() / test_lb.count()))
    # printlog('train feature shape:         {}; '.format(train_fe.shape))
    # printlog('validation feature shape:    {}; '.format(valid_fe.shape))
    # printlog('test feature shape:          {}; '.format(test_fe.shape))
    # pd.concat([train_fe, train_lb], axis=1, sort=True).to_csv(ds_train, encoding='gb18030')
    # pd.concat([valid_fe, valid_lb], axis=1, sort=True).to_csv(ds_valid, encoding='gb18030')
    # pd.concat([test_fe,  test_lb],  axis=1, sort=True).to_csv(ds_test,  encoding='gb18030')

    # printlog('-----------------------------------feature selection-----------------------------------')
    # printlog('-----------------------------------feature selection on gate feature and tree classifier-----------------------------------')
    # fe_gate       = refreshModelFeature(ds_train, fe_gate_pattern)
    # ## gate feature
    # fe_gate_upper = Feature_selection.hit_positive_rate(ds_train, fe_gate, -1, hit_pos_rate_upper, na_replacement=-1, encoding='gb18030')
    # fe_gate_lower = Feature_selection.hit_positive_rate(ds_train, fe_gate, -1, hit_pos_rate_lower, na_replacement=-1, encoding='gb18030')
    # Log.itersave(fe_gate_hit, fe_gate_upper)
    # Log.itersave(fe_gate_tree, [fe for fe in fe_gate_lower if fe not in fe_gate_upper])
    # ## tree model
    # tcl = Model.tree_classifier(
    #     ds=ds_train, features=Log.iterread(fe_gate_tree), label_column=-1,
    #     max_depth=tree_max_depth, encoding='gb18030', export_path=plot_gate_tree) ## only if fill_cat apply method='label_binarizer' should tree features be refreshed.
    # dump(tcl, tree_gate)

    # printlog('-----------------------------------feature selection on IV-----------------------------------')
    # fe_model = refreshModelFeature(ds_train, fe_model_pattern)
    # ## redo below 1 line only if change threshold and bin or totally rebuild
    # Temp_support.cut(ds_train, fe_model, threshold=10, bin=10, method='equal-frequency', save_path=ds_cut, encoding='gb18030')
    # Temp_support.select_feature_iv(ds_cut, fe_model, -1, iv_upper_thresh, iv_lower_thresh, to_file=iv_detail, encoding='gb18030')
    # ds_temp = pd.read_csv(iv_detail, encoding='gb18030', header=0, index_col=0)
    # ds_temp.sort_values('iv', ascending=False).head(5).to_csv(fe_iv)
    # # ds_temp = pd.read_csv(iv_detail, encoding='gb18030', header=0, index_col=0)['iv']
    # # ds_temp[ds_temp.between(iv_lower_thresh, iv_upper_thresh)].to_csv(fe_iv, header='iv')

    from utils.Simplify import method_iteration, results_archive

    # def func_whot_return(going):
    #     print('func: go {} with bebe'.format(going))
    # def func_with_return(going, being):
    #     print('func: go {} with {}'.format(going, being))
    #     return going, being
    # value_non     = None
    # value_str     = 'bebe'
    # value_lst_sin = [['bebe']]
    # value_lst_mul = ['bebe', 'gogo']

    # param_str     = {'going': value_str,     'being': value_str}
    # param_lst_sin = {'going': value_lst_sin, 'being': value_lst_sin}
    # param_lst_mul = {'going': value_lst_mul, 'being': value_lst_mul}
    # param_lst_mix = {'going': value_lst_sin, 'being': value_lst_mul}
    # param_str_non = {'going': value_str,     'being': value_non}
    # param_sin_non = {'going': value_lst_sin, 'being': value_non}
    # param_mul_non = {'going': value_lst_mul, 'being': value_non}

    # keys = [
    #     ['going', 'bebe'],
    #     ['going', 'bebe'],
    #     None,
    #     'x'
    # ]

    # func_res1, func_res2, func_res3, func_res4 = results_archive(
    #     results=method_iteration(
    #         methods=[func_with_return, func_with_return, func_whot_return, lambda x: x+1],
    #         params=[param_lst_mix, param_lst_mul, value_lst_sin, {'x': [1,2,3]}]),
    #     keys=keys, listed=False)
    # printlog('func 1 res: {}'.format(func_res1))
    # printlog('func 2 res: {}'.format(func_res2))
    # printlog('func 3 res: {}'.format(func_res3))
    # printlog('func 4 res: {}'.format(func_res4))
    # printlog('-----------------------------------feature selection on lasso/xgb-----------------------------------')
    # classed_fe_model = Preprocess.pattern_to_feature(ds_train, fe_model_pattern, encoding='gb18030')
    # ds_t = pd.read_csv(ds_train, encoding='gb18030', header=0, index_col=0)
    # listed_all_lasso_coef = []
    # listed_best_lasso_coef = []
    # listed_all_xgb_imprt = []
    # listed_best_xgb_imprt = []
    # for fe_model in tqdm(classed_fe_model):
    #     best_feaures, all_features = Feature_selection.select_on_lasso(
    #         X=ds_t.loc[:, fe_model], y=ds_t.iloc[:, -1],
    #         lasso_params={'alpha': lasso_alpha}, sort_index=2, sorted=True,
    #         encoding='gb18030')
    #     listed_best_lasso_coef.append(best_feaures)
    #     listed_all_lasso_coef.append(all_features)
    #     best_feaures, all_features = Feature_selection.select_on_xgb(
    #         X=ds_t.loc[:, fe_model], y=ds_t.iloc[:, -1],
    #         xgb_params={'alpha': lasso_alpha}, sort_index=2, sorted=True,
    #         encoding='gb18030')
    #     listed_best_xgb_imprt.append(best_feaures)
    #     listed_all_xgb_imprt.append(all_features)
    # pd.concat(listed_all_lasso_coef, axis=0).to_csv(lasso_detail, encoding='gb18030', header='lasso_coef')
    # pd.concat(listed_best_lasso_coef, axis=0).to_csv(fe_lasso, encoding='gb18030', header='lasso_coef')
    # pd.concat(listed_all_xgb_imprt, axis=0).to_csv(xgb_detail, encoding='gb18030', header='feature_importances')
    # pd.concat(listed_best_xgb_imprt, axis=0).to_csv(fe_xgb, encoding='gb18030', header='feature_importances')

    # printlog('-----------------------------------feature selection on lasso/xgb-----------------------------------')
    classed_fe_model = Preprocess.pattern_to_feature(ds_train,
                                                     fe_model_pattern,
                                                     encoding='gb18030')
    ds_t = pd.read_csv(ds_train, encoding='gb18030', header=0, index_col=0)
    lasso_select_params = {
        'X': [ds_t.loc[:, fe_model] for fe_model in classed_fe_model],
        'y': [ds_t.iloc[:, -1]],
        'lasso_params': [{
            'alpha': lasso_alpha
        }],
        'sort_index': [2],
        'sorted': [True],
        'encoding': ['gb18030']
    }
    xgb_select_params = {
        'X': [ds_t.loc[:, fe_model] for fe_model in classed_fe_model],
        'y': [ds_t.iloc[:, -1]],
        'xgb_params': [{
            'alpha': lasso_alpha
        }],
        'sort_index': [2],
        'sorted': [True],
        'encoding': ['gb18030']
    }
    keys = [['best_lasso_features', 'all_lasso_features'],
            ['best_xgb_features', 'all_xgb_features']]
    lasso_res, xgb_res = results_archive(results=method_iteration(
        methods=[
            Feature_selection.select_on_lasso, Feature_selection.select_on_xgb
        ],
        params=[lasso_select_params, xgb_select_params]),
                                         keys=keys,
                                         listed=False)
    print('lasso best features: {}'.format(lasso_res['best_lasso_features']))
    print('xgb   best features: {}'.format(xgb_res['best_xgb_features']))

    # printlog('-----------------------------------features-----------------------------------')
    # hitrate_features  = Log.iterread(fe_gate_hit)
    # tree_features     = Log.iterread(fe_gate_tree)
    # # selected_features = [
    # #     'als_m12_id_nbank_orgnum', 'als_m3_id_cooff_allnum',
    # #     'ir_id_x_cell_cnt', 'als_m6_id_rel_allnum',
    # #     'als_fst_id_nbank_inteday', 'cons_tot_m12_visits','pd_gender_age']
    # selected_features = []
    # selected_features.extend(pd.read_csv(fe_iv, encoding='gb18030', header=0, index_col=0).index.tolist())
    # selected_features.extend(pd.read_csv(fe_xgb, encoding='gb18030', header=0, index_col=0).index.tolist())
    # selected_features.extend(pd.read_csv(fe_lasso, encoding='gb18030', header=0, index_col=0).index.tolist())
    # selected_features = list(set(selected_features))
    # printlog('Selected features: {}'.format(selected_features), printable=False)

    # printlog('-----------------------------------prepare train dataset-----------------------------------')
    # train_dataset = pd.read_csv(ds_train, encoding='gb18030', header=0, index_col=0)
    # valid_dataset = pd.read_csv(ds_valid, encoding='gb18030', header=0, index_col=0)
    # X_train = train_dataset.loc[:, selected_features].values
    # y_train = train_dataset.iloc[:,-1]
    # X_valid = valid_dataset.loc[:, selected_features].values
    # y_valid = valid_dataset.iloc[:,-1]

    # printlog('-----------------------------------train on xgb-----------------------------------')
    # def objective(y_true, y_pred):
    #     multiplier = pd.Series(y_true).mask(y_true == 1, xgb_FN_grad_mul).mask(y_true == 0, xgb_FP_grad_mul)
    #     grad = multiplier * (y_pred - y_true)
    #     hess = multiplier * np.ones(y_pred.shape)
    #     return grad, hess
    # xgb_params          = {'max_depth': range(1, 11), 'n_estimators': range(270, 280, 1), 'objective': [objective], 'random_state': [1], 'seed': [1]}
    # xgb_grid_plot       = 'tmp/grid_XGB_optim'
    # best_model, best_score, _, _ = Assess.gridTrainValidSelection(
    #     XGBClassifier(), xgb_params, X_train, y_train, X_valid, y_valid, # nfolds=5 [optional, instead of validation set]
    #     metric=roc_auc_score, greater_is_better=True,
    #     scoreLabel='ROC AUC', showPlot=False, to_file=None)
    # printlog(best_model, best_score)
    # dump(XGBClassifier(), model_xgb)
    # dump(best_model, model_xgb_optim)

    # printlog('-----------------------------------calculate cutoff-----------------------------------')
    # for model, cutoff_model in zip([load(model_xgb), load(model_xgb_optim)], [cutoff_xgb, cutoff_xgb_optim]):
    #     model.fit(X_train, y_train)
    #     cutoff = optimalCutoff(model, X_valid, y_valid.to_numpy())
    #     Log.itersave(cutoff_model, [cutoff])

    # ###########################################shit###############################
    # estimators = [
    #     ('RF',   RandomForestClassifier()),
    #     ('ET',   ExtraTreesClassifier()),
    #     ('AB',   AdaBoostClassifier()),
    #     ('GBDT', GradientBoostingClassifier()),
    #     ('XGB',  XGBClassifier())
    # ]
    # grids = [
    #     {
    #         'n_estimators': range(10, 101, 10),
    #         'min_samples_leaf': [1, 5, 10, 15, 20, 25],
    #         'max_features': ['sqrt', 'log2', 0.5, 0.6, 0.7],
    #         'n_jobs': [-1], 'random_state': [1]},
    #     {
    #         'n_estimators': range(10, 101, 10),
    #         'min_samples_leaf': [1, 5, 10, 15, 20, 25],
    #         'max_features': ['sqrt', 'log2', 0.5, 0.6, 0.7],
    #         'n_jobs': [-1], 'random_state': [1]},
    #     {
    #         'n_estimators': range(10, 101, 10),
    #         'random_state': [1]},
    #     {
    #         'n_estimators': range(10, 101, 10),
    #         'min_samples_leaf': [1, 5, 10, 15, 20, 25],
    #         'max_features': ['sqrt', 'log2', 0.5, 0.6, 0.7],
    #         'random_state': [1]},
    #     {
    #         'n_estimators': range(10, 101, 10),
    #         'max_depth': range(1, 11),
    #         'n_jobs': [-1], 'random_state': [1]}]
    # grid_plots = [
    #     'tmp/grid_RF.png', 'tmp/grid_ET.png', 'tmp/grid_AB.png',
    #     'tmp/grid_GBDT.png', 'tmp/grid_XGB.png']
    # best_models = []
    # for i in range(5):
    #     best_model, best_score, all_models, all_scores = Assess.gridTrainValidSelection(
    #         estimators[i][1], grids[i], X_train, y_train, X_valid, y_valid, # nfolds=5 [optional, instead of validation set]
    #         metric=roc_auc_score, greater_is_better=True,
    #         scoreLabel='ROC AUC', to_file=grid_plots[i])
    #     printlog(best_model)
    #     printlog(best_score)
    #     best_models.append((estimators[i][0], best_model))
    # stackingClassifier = StackingClassifier(estimators=best_models)
    # dump(stackingClassifier, model_stacking)
    # printlog('-----------------------------------train on stacking-----------------------------------')
    # estimators = [
    #     ('RF',   RandomForestClassifier()),
    #     ('ET',   ExtraTreesClassifier()),
    #     ('AB',   AdaBoostClassifier()),
    #     # ('GBDT', GradientBoostingClassifier()),
    #     ('XGB',  XGBClassifier())
    # ]
    # estimator_params = [
    #     {'max_depth': range(10, 101, 1), 'n_estimators': range(30, 121, 1)},
    #     {'max_depth': range(10, 101, 1), 'n_estimators': range(30, 121, 1)},
    #     {'n_estimators': range(30, 121, 1)},
    #     # {'max_depth': range(10, 121, 5), 'n_estimators': range(10, 121, 5)},
    #     {'max_depth': range(2,  10,  1), 'n_estimators': range(10, 121, 1)}
    # ]
    # for i, (estimator, params) in enumerate(zip(estimators, estimator_params)):
    #     estimators[i][1].set_params(**Assess.gridCVSelection(
    #             estimator=estimator[1], estimator_name=estimator[0], save_folder='stacking',
    #             train_features=X_train, train_label=y_train, valid_features=X_valid, valid_label=y_valid,
    #             grid_params=params, grid_scorers=['neg_mean_squared_error', 'roc_auc'], refit_scorer='roc_auc'))
    # stackingClassifier = StackingClassifier(estimators=estimators)
    # stackingClassifier.fit(X_train, y_train)
    # dump(stackingClassifier, model_stacking)
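    # ## Hedged aside (not the project's Assess helper): a rough sklearn-only analogue of the
    # ## per-estimator search above, using cross-validation instead of the fixed validation split.
    # from sklearn.model_selection import GridSearchCV
    # search = GridSearchCV(XGBClassifier(random_state=1),
    #                       {'max_depth': range(2, 10), 'n_estimators': range(10, 121, 10)},
    #                       scoring='roc_auc', cv=5, n_jobs=-1)
    # search.fit(X_train, y_train)
    # printlog(search.best_params_, search.best_score_)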

    # printlog('-----------------------------------prepare test dataset-----------------------------------')
    # test_dataset = pd.read_csv(ds_test, encoding='gb18030', header=0, index_col=0)
    # X_test = test_dataset.loc[:, selected_features].values
    # y_test = test_dataset.iloc[:, -1]

    # printlog('-----------------------------------test on gate and tree-----------------------------------')
    # pred_hit     = (test_dataset[hitrate_features] != -1).any(axis=1).astype(int)
    # pred_tree    = pd.Series(load(tree_gate).predict(test_dataset[tree_features]), index=test_dataset.index)
    # printlog('gate test: {} labelled 1 by hit positive rate.'.format(pred_hit.sum()))
    # printlog('gate test: {} labelled 1 by tree classifier.'.format(pred_tree.sum()))
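    # ## Hedged sketch of the gating idea (applyGate itself is defined elsewhere): samples flagged by
    # ## the hit-rate rule or by the tree gate could simply be forced to the positive class.
    # def apply_gate_sketch(proba, pred_hit, pred_tree):    # hypothetical stand-in, not the project helper
    #     gated = proba.copy()
    #     force_pos = ((pred_hit == 1) | (pred_tree == 1)).to_numpy()
    #     gated[force_pos] = [0.0, 1.0]                     # overwrite [P(0), P(1)] rows
    #     return gated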

    # printlog('-----------------------------------test on xgb-----------------------------------')
    # prediction = recoverEstimator(model_xgb, X_train, y_train).predict(X_test)
    # print((prediction == 1).sum())
    # prediction_optim    = recoverEstimator(model_xgb_optim, X_train, y_train).predict(X_test)
    # # prediction = y_test.copy()
    # # labeled_index = prediction[prediction == 1].index.tolist()
    # # unlabeled_index = prediction[prediction == 0].index.tolist()
    # # prediction.loc[labeled_index[:89]] = 0
    # # prediction.loc[unlabeled_index[:46]] = 1
    # # Assess.modelAssess(y_test, prediction, '/', 'Stacking')
    # # Assess.confusionMatrixFromPrediction(
    # #     y_test, prediction,       [0, 1], 'Normalized matrix on Stacking',
    # #     'true', plt.cm.Blues, 'confusion_Stacking.png')
    # Assess.confusionMatrixFromPrediction(
    #     y_test, prediction_optim, [0, 1], 'Normalized matrix on XGB_optim without cutoff',
    #     'true', plt.cm.Blues, 'tmp/confusion_XGB_optim_raw.png')
    # prediction          = recoverEstimator(model_xgb, X_train, y_train).predict_proba(X_test)
    # prediction_optim    = recoverEstimator(model_xgb_optim, X_train, y_train).predict_proba(X_test)
    # ## assess model
    # Assess.modelAssess(y_test.to_numpy(), prediction,       'misc', 'XGB_before_gate')
    # Assess.modelAssess(y_test.to_numpy(), prediction_optim, 'misc', 'XGB_optim_before_gate')
    # ## apply gate prediction to xgb prediction
    # prediction          = applyGate(prediction,       pred_hit, pred_tree)
    # prediction_optim    = applyGate(prediction_optim, pred_hit, pred_tree)
    # ## assess model
    # Assess.modelAssess(y_test.to_numpy(), prediction,       'misc', 'XGB')
    # Assess.modelAssess(y_test.to_numpy(), prediction_optim, 'misc', 'XGB_optim')
    # ## apply cutoff formula
    # cutoff=0.9
    # cutoff_optim=0.7
    # prediction          = applyCutoff(prediction, cutoff)
    # prediction_optim    = applyCutoff(prediction_optim, cutoff_optim)
    # Assess.confusionMatrixFromPrediction(
    #     y_test, prediction[:, 1],       [0, 1], 'Normalized matrix on XGB with cutoff',
    #     'true', plt.cm.Blues, 'tmp/confusion_XGB.png')
    # Assess.confusionMatrixFromPrediction(
    #     y_test, prediction_optim[:, 1], [0, 1], 'Normalized matrix on XGB_optim with cutoff',
    #     'true', plt.cm.Blues, 'tmp/confusion_XGB_optim.png')
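    # ## Hedged sketch of the thresholding behind applyCutoff (the actual helper is not shown): derive
    # ## hard labels from the positive-class probability column using the chosen cutoff.
    # def apply_cutoff_sketch(proba, cutoff):               # hypothetical stand-in, not the project helper
    #     positive = (proba[:, 1] >= cutoff).astype(float)
    #     return np.stack([1.0 - positive, positive], axis=1)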

    # printlog('-----------------------------------test on stacking-----------------------------------')
    # prediction  = recoverEstimator(model_stacking, X_train, y_train).predict(X_test)
    # Assess.confusionMatrixFromPrediction(
    #     y_test, prediction,       [0, 1], 'Normalized matrix on stacking without cutoff',
    #     'true', plt.cm.Blues, 'tmp/confusion_stacking_raw.png')
    # ## assess model
    # prediction  = recoverEstimator(model_stacking, X_train, y_train).predict_proba(X_test)
    # Assess.modelAssess(y_test.to_numpy(), prediction, 'misc', 'ENSEMBLE_before_gate')
    # ## apply gate prediction to xgb prediction
    # prediction = applyGate(prediction, pred_hit, pred_tree)
    # ## assess model
    # Assess.modelAssess(y_test.to_numpy(), prediction, 'misc', 'ENSEMBLE')
    # ## apply cutoff formula
    # prediction = applyCutoff(prediction, cutoff=0.7)
    # Assess.confusionMatrixFromPrediction(
    #     y_test, prediction[:, 1],       [0, 1], 'Normalized matrix on stacking with cutoff',
    #     'true', plt.cm.Blues, 'tmp/confusion_stacking.png')

    printlog(
        '-----------------------------------finished-----------------------------------'
    )
Example #26
def EDA(ds,
        data_type,
        folder=None,
        save_graph=True,
        encoding='utf-8',
        header=0,
        index_col=0,
        largeset=False,
        nrows=1000):
    '''
    # Params:

    ds: str/pd.DataFrame, dataset or dataset path

    data_type: str, either 'feature' or 'label'; decides EDA mode

    folder(default None): str, if not None, save EDA files in the folder

    save_graph(default True): boolean, whether to save the reports as images (True) or as csv (False); only used when folder is given

    encoding(default 'utf-8'): str, encoding of dataset

    header(default 0): int/list of int, works on pandas.read_csv()
    (learn more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html)

    index_col(default 0): int/list of int, works on pandas.read_csv()
    (learn more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html)

    largeset(default False): boolean, whether to apply a low-memory method for EDA

    nrows(default 1000): int, works on pandas.read_csv(), only used when largeset is True
    (learn more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html)

    '''
    types = ['feature', 'label']
    assert data_type in types, 'data_type is not valid; should be \'feature\' or \'label\''

    if type(ds) is str:
        printlog('---------------------EDA of {}---------------------'.format(
            os.path.basename(ds)))
    if not largeset:
        ds_raw = pd.read_csv(
            ds, encoding=encoding, index_col=index_col,
            header=header) if isinstance(ds, str) else ds
        ## size, head and label
        printlog(
            'SIZE:            [{} sample(row) * {} feature(column)]'.format(
                ds_raw.shape[0], ds_raw.shape[1]))
        printlog('HEAD:            \n{}'.format(ds_raw.head()))
        if data_type == 'label':
            printlog('LABELS:          {}'.format(
                list(set(np.ravel(ds_raw.values)))))
        ## na data, feature type
        na_data_path, fe_data_path = None, None
        if folder and save_graph:
            na_data_path = os.path.join(folder, 'record_na_data.png')
            fe_data_path = os.path.join(folder, 'record_feature_type.png')
        if folder and not save_graph:
            na_data_path = os.path.join(folder, 'record_na_data.csv')
            fe_data_path = os.path.join(folder, 'record_feature_type.csv')
        na_data(ds_raw, na_data_path)
        feature_type(ds_raw, fe_data_path)
    elif largeset:
        if type(ds) is str:
            rows, columns = 0, 0
            with open(ds, encoding=encoding) as file:
                for line in file:
                    rows += 1
                    columns = len(line.split(',')) - 1
            rows -= 1
            printlog('[{} sample(row) * {} feature(column)]'.format(
                rows, columns))
            ds_raw = pd.read_csv(ds,
                                 nrows=nrows,
                                 encoding=encoding,
                                 header=header,
                                 index_col=index_col)
        else:
            ds_raw = ds
        EDA(ds_raw,
            data_type,
            encoding=encoding,
            header=header,
            index_col=index_col)
    if type(ds) is str:
        printlog('+++++++++++++++++++++EDA of {}+++++++++++++++++++++'.format(
            os.path.basename(ds)))
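
# Hedged usage sketch for EDA; the file paths and folder below are illustrative only:
# EDA('data/train_features.csv', 'feature', folder='misc', encoding='gb18030')
# EDA('data/train_labels.csv', 'label', largeset=True, nrows=500, encoding='gb18030')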
Example #27
def date_feature(ds,
                 feature,
                 labels=None,
                 label_column=None,
                 file_path=None,
                 save_graph=True,
                 encoding='utf-8',
                 header=0,
                 index_col=0):
    '''
    # Params:

    ds: str/pd.DataFrame, dataset or dataset path

    feature: str, feature in datetime format

    labels(default None): list, dataset labels

    label_column(default None): str/int, name or positional index of the label column in ds; if both labels and label_column are not None, the date counts are additionally broken down by label

    file_path(default None): str, if not None, the result of the check is saved at the path

    save_graph(default True): boolean, whether to save the result as a graph (True) or as csv (False)

    encoding(default 'utf-8'): str, encoding of dataset

    header(default 0): int, works on pandas.read_csv()
    (learn more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html)

    index_col(default 0): int, works on pandas.read_csv()
    (learn more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html)

    '''
    assert isinstance(feature, str), 'date_feature: feature should be str; got {}'.format(feature)
    ds = pd.read_csv(ds, encoding=encoding, header=header,
                     index_col=index_col) if isinstance(ds, str) else ds
    assert feature in ds.columns, 'date_feature: feature should be a column of ds'
    labels_countby_date = []
    if labels and label_column is not None:
        label_column = ds.columns[label_column] if isinstance(
            label_column, int) else label_column
        for label in labels:
            ds_tmp = ds[ds[label_column] == label]
            ds_tmp = pd.to_datetime(ds_tmp[feature])
            labels_countby_date.append(
                ds_tmp.groupby([ds_tmp.dt.year, ds_tmp.dt.month]).count())
    ds = pd.to_datetime(ds[feature])
    date_count = ds.groupby([ds.dt.year, ds.dt.month]).count()
    printlog('FEATURE {} DATE COUNT: \n{}'.format(feature, date_count))
    if file_path and save_graph:
        assert re.search(
            r'\.(png|jpg|jpeg)$', file_path
        ), 'date_feature: file_path is not in image format; use .png, .jpg or .jpeg suffix'
        plt.figure(figsize=[10, 10])
        if labels and label_column is not None:
            prev = pd.Series(np.zeros(date_count.size), index=date_count.index)
            plt.bar([str(value) for value in prev.index.values],
                    prev.values)
            for i, label in enumerate(labels_countby_date):
                if prev is not None:
                    prev = pd.Series([prev[index] for index in label.index],
                                     index=label.index)
                # print(prev)
                # print([(str)(value) for value in label.index.values])
                plt.bar([str(value) for value in label.index.values],
                        label.values,
                        bottom=prev,
                        label='label: {}'.format(labels[i]))
                k = 0
                for j, index in enumerate(date_count.index):
                    if k == label.index.size:
                        break
                    if index == label.index[k]:
                        # print('index: {}'.format(index))
                        plt.plot(j, label[k] + prev[k], marker='D')
                        plt.text(j - 0.3, label[k] + prev[k] + 1,
                                 str(label[k]))
                        k += 1
                prev = label.add(prev, fill_value=0)
            else:
            plt.bar([str(value) for value in date_count.index.values],
                    date_count.values)
        for i, data in enumerate(date_count.values):
            plt.text(i - 0.3, 2, str(data))
        plt.title('Count on date of feature {}'.format(feature))
        plt.xlabel('Date range')
        plt.xticks(rotation=90)
        plt.ylabel('Sample number')
        plt.legend()
        plt.savefig(file_path)
        plt.close()
    if file_path and not save_graph:
        assert re.search(
            r'\.csv$', file_path
        ), 'date_feature: file_path does not match tabular format; use .csv suffix'
        date_count.to_csv(file_path, encoding=encoding)
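
# Hedged usage sketch for date_feature; file name, column names and labels are illustrative only:
# date_feature('data/train.csv', 'apply_time', labels=[0, 1], label_column='flag',
#              file_path='misc/date_count.png', encoding='gb18030')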