# Imports required by this script (DataDeal, Data_Numeric, FSVM and LS_FSVM
# are local modules from this repository).
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import svm, preprocessing
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

import DataDeal
import Data_Numeric
import FSVM
import LS_FSVM


def Attribut_rank(model):
    data = pd.read_csv('german_credit.csv')
    X = data.drop(['default'], axis=1)
    label = data['default']
    df = Data_Numeric.Data_numerique(X)
    data = DataDeal.get_data(df, label)
    Train_data, test = train_test_split(data, test_size=0.2, random_state=42)
    x_test = test[:, :-1]
    y_test = test[:, -1]
    x_train = Train_data[:, :-1]
    y_train = Train_data[:, -1]

    if model == 'LSFSVM':
        kernel_dict = {'type': 'RBF', 'sigma': 0.717}
        fuzzyvalue = {'type': 'Cen', 'function': 'Lin'}
        clf = LS_FSVM.LSFSVM(10, kernel_dict, fuzzyvalue, 'origine', 4 / 5)
        m = clf._mvalue(x_train, y_train)
    elif model == 'FSVM':
        kernel_dict = {'type': 'RBF', 'sigma': 0.717}
        fuzzyvalue = {'type': 'Cen', 'function': 'Lin'}
        clf = FSVM.FSVM(10, kernel_dict, fuzzyvalue, 'origine', 4 / 5)
        m = clf._mvalue(x_train, y_train)
    elif model == 'SVM':
        clf = svm.SVC()

    # Baseline AUC with the complete attribute set.
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    auc_complete = roc_auc_score(y_test, y_pred)

    AUC = []
    for col in X.columns:
        # Variant 1 (disabled): delete one attribute at a time.
        # X_r = X.drop([col], axis=1)
        # df = Data_Numeric.Data_numerique(X_r)
        # data = DataDeal.get_data(df, label)

        # Variant 2 (active): keep only one attribute.
        X_r = pd.DataFrame(X[col])
        label[label == 0] = -1
        df = Data_Numeric.Data_numerique(X_r)
        X_r = np.array(df)
        min_max_scaler = preprocessing.MinMaxScaler()
        X_r = min_max_scaler.fit_transform(X_r)
        data = np.append(X_r, np.array(label)[:, None], axis=1)

        # Re-split on the single-attribute data (this line was commented out
        # in the original, which left the loop training on the stale arrays
        # from the full-feature split above).
        Train_data, test = train_test_split(data, test_size=0.2, random_state=42)
        x_test = test[:, :-1]
        y_test = test[:, -1]
        x_train = Train_data[:, :-1]
        y_train = Train_data[:, -1]

        if model == 'LSFSVM':
            kernel_dict = {'type': 'RBF', 'sigma': 0.717}
            fuzzyvalue = {'type': 'Cen', 'function': 'Lin'}
            clf = LS_FSVM.LSFSVM(10, kernel_dict, fuzzyvalue, 'origine', 4 / 5)
            m = clf._mvalue(x_train, y_train)
        elif model == 'FSVM':
            kernel_dict = {'type': 'RBF', 'sigma': 0.717}
            fuzzyvalue = {'type': 'Cen', 'function': 'Lin'}
            clf = FSVM.FSVM(10, kernel_dict, fuzzyvalue, 'origine', 4 / 5)
            m = clf._mvalue(x_train, y_train)
        elif model == 'SVM':
            clf = svm.SVC()

        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)
        auc = roc_auc_score(y_test, y_pred)
        AUC.append(auc)

    # Rank attributes by their single-attribute AUC.
    indices = np.argsort(AUC)[::-1]
    featurerank = [X.columns[i] for i in indices]

    print('AUC complete', auc_complete)
    plt.figure(figsize=(10, 8))
    feature_imp = pd.Series(AUC, index=X.columns).sort_values(ascending=False)
    sns.barplot(x=feature_imp, y=feature_imp.index)
    plt.xlim((0.4, 0.7))
    plt.xlabel('Feature Importance Score (AUC)')
    plt.ylabel('Features')
    plt.title('Visualizing Important Features for SVM')
    plt.show()
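# A minimal way to exercise the ranking above (hypothetical driver block, not
# part of the original file; assumes german_credit.csv and the local modules
# are available on the path):
if __name__ == '__main__':
    for model_name in ('SVM', 'FSVM', 'LSFSVM'):
        Attribut_rank(model_name)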
            if judgment == 'Acc':
                score = len(y_test[y_predict == y_test]) / test_length
            elif judgment == 'AUC':
                score = roc_auc_score(y_test, y_predict)
            score_memory.append(score)
            if score > best_score:
                best_score = score
                best_parameter = [C, d]
    print('kernel_dict:', kernel_dict_type)
    print('best_parameter', best_parameter)
    return best_parameter


if __name__ == '__main__':
    x_train, y_train, x_test, y_test = DataDeal.get_data()
    fuzzyvalue = {'type': 'Cen', 'function': 'Lin'}
    # Search ranges for the penalty C and the RBF width sigma.
    param_grid = {
        'C': np.logspace(0, 1, 50),
        'sigma': np.logspace(-2, 0.5, 50)
    }
    C = LS_FSVM_best(x_train, y_train, 'LINEAR', param_grid, 'AUC',
                     fuzzyvalue, 3 / 4, 1)
    kernel_dict = {'type': 'LINEAR'}
    clf = LS_FSVM.LSFSVM(C, kernel_dict, fuzzyvalue, 3 / 4)
    clf._mvalue(x_train, y_train)
    clf.fit(x_train, y_train)
    y_predict = clf.predict(x_test)
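# For reference, the 'judgment' switch in the search loop above can be read
# as a small scoring helper. This is a sketch, not the repository's code;
# the function name is hypothetical, and it assumes y_test / y_predict are
# aligned 1-D label arrays.
import numpy as np
from sklearn.metrics import roc_auc_score

def score_predictions(y_test, y_predict, judgment='AUC'):
    if judgment == 'Acc':
        # Fraction of exact label matches, equivalent to
        # len(y_test[y_predict == y_test]) / len(y_test).
        return float(np.mean(np.asarray(y_predict) == np.asarray(y_test)))
    elif judgment == 'AUC':
        return roc_auc_score(y_test, y_predict)
    raise ValueError("judgment must be 'Acc' or 'AUC'")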
            y_prob[i] = round(y_prob[i], 3)
        return y_prob

    def decision_function(self, X):
        return self.y_predict


# Test Code for _LSSVMtrain
if __name__ == '__main__':
    data = DataDeal.get_data('german_numerical.csv')
    Train_data, test = train_test_split(data, test_size=0.2)
    x_test = test[:, :-1]
    y_test = test[:, -1]
    x_train = Train_data[:, :-1]
    y_train = Train_data[:, -1]

    kernel_dict = {'type': 'RBF', 'sigma': 0.717}
    fuzzyvalue = {'type': 'Cen', 'function': 'Lin'}

    clf = FSVM(10, kernel_dict, fuzzyvalue, 'origine', 4 / 5)
    m = clf._mvalue(x_train, y_train)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
            y_prob[i] = round(y_prob[i], 3)
        return y_prob

    def decision_function(self, X):
        return self.y_predict


# Test Code for _LSSVMtrain
if __name__ == '__main__':
    data = pd.read_csv('DF4.csv')
    X = data.drop(['default'], axis=1)
    label = data['default']
    data = DataDeal.get_data(X, label, 'normaliser', scaler='True')
    x = data[:, :-1]
    y = data[:, -1]
    Train_data, test = train_test_split(data, test_size=0.2, random_state=42)
    x_test = test[:, :-1]
    y_test = test[:, -1]
    x_train = Train_data[:, :-1]
    y_train = Train_data[:, -1]

    # Alternative: stratified resampling instead of a single split.
    # ss = StratifiedShuffleSplit(n_splits=3, test_size=0.2, train_size=0.8, random_state=0)
    # for train_index, test_index in ss.split(x, y):
    #     x_train, x_test = x[train_index, :], x[test_index, :]  # feature rows of each split
    #     y_train, y_test = y[train_index], y[test_index]        # matching class labels
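# The commented-out stratified split preserves the class ratio in every fold.
# A self-contained toy illustration (invented arrays, not the repo's data):
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

X_toy = np.arange(24).reshape(12, 2)
y_toy = np.array([1] * 8 + [0] * 4)   # imbalanced labels, ratio 2:1
ss_toy = StratifiedShuffleSplit(n_splits=3, test_size=0.25, random_state=0)
for train_index, test_index in ss_toy.split(X_toy, y_toy):
    print(y_toy[test_index])  # each 3-sample test fold keeps the 2:1 ratio, e.g. [1 0 1]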
def IV_plot(model):
    data = pd.read_csv('german_credit.csv')
    X = data.drop(['default'], axis=1)

    if model == 'Origine':
        # Use the true labels, remapped so that 0 = bad, 1 = good.
        Y = data['default'].copy()
        Y = Y - 1
        Y[Y == -1] = 1
    else:
        # Use the labels predicted by a previously trained classifier; the
        # four model variants differ only in the pickle they load.
        pickle_paths = {
            'FSVM': 'save/FSVM_Cen_Lin_RBF_Origine.pickle',
            'LSFSVM': 'save/LSFSVM_Cen_Lin_RBF_Origine.pickle',
            'LSFSVM_bagging': 'save/LSFSVMbag_Cen_Lin_RBF_Origine.pickle',
            'FSVM_bagging': 'save/FSVMbag_Cen_Lin_RBF_Origine.pickle',
        }
        label = data['default']
        df = Data_Numeric.Data_numerique(X)
        data = DataDeal.get_data(df, label)
        x = data[:, :-1]
        with open(pickle_paths[model], 'rb') as f:
            clf = pickle.load(f)
        y_pred = clf.predict(x)
        y = y_pred.copy()
        y[y == -1] = 0
        y = y.astype('int64')
        Y = pd.Series(y, name='Yp')

    badnum = len(Y[Y == 0])       # number of bad clients
    goodnum = Y.count() - badnum  # number of good clients

    def self_bin_object(X):
        # Bin a categorical attribute by its values, then compute the WOE of
        # each bin and the attribute's Information Value (IV).
        d1 = pd.DataFrame({"X": X, "Y": Y, "Bucket": X})
        d2 = d1.groupby('Bucket', as_index=True)  # aggregate per bin
        d3 = pd.DataFrame(d2.count(), columns=['good'])
        d3['good'] = d2.sum().Y
        d3['total'] = d2.count().Y
        d3['bad'] = d3['total'] - d3['good']
        d3['rate'] = d2.mean().Y
        d3['woe'] = np.log((d3['bad'] / badnum) / (d3['good'] / goodnum))  # WOE per bin
        d3['badattr'] = d3['bad'] / badnum      # share of bad clients per bin
        d3['goodattr'] = d3['good'] / goodnum   # share of good clients per bin
        iv = ((d3['badattr'] - d3['goodattr']) * d3['woe']).sum()  # Information Value
        d4 = d3.sort_values(by='good').reset_index(drop=True)  # rank bins
        woe = list(d4['woe'].round(3))
        return iv, d3, woe

    def self_bin_numeric(X, cut):
        # Same as above, but a numeric attribute is first cut into intervals.
        d1 = pd.DataFrame({"X": X, "Y": Y, "Bucket": pd.cut(X, cut)})
        d2 = d1.groupby('Bucket', as_index=True)
        d3 = pd.DataFrame(d2.count(), columns=['good'])
        d3['good'] = d2.sum().Y
        d3['total'] = d2.count().Y
        d3['bad'] = d3['total'] - d3['good']
        d3['rate'] = d2.mean().Y
        d3['woe'] = np.log((d3['bad'] / badnum) / (d3['good'] / goodnum))
        d3['badattr'] = d3['bad'] / badnum
        d3['goodattr'] = d3['good'] / goodnum
        iv = ((d3['badattr'] - d3['goodattr']) * d3['woe']).sum()
        d4 = d3.sort_index(axis=1).reset_index(drop=True)
        woe = list(d4['woe'].round(3))
        return iv, d3, woe

    iv_fw = self_bin_object(X['foreign_worker'])[0]
    iv_acs = self_bin_object(X['account_check_status'])[0]
    iv_ch = self_bin_object(X['credit_history'])[0]
    iv_pur = self_bin_object(X['purpose'])[0]
    iv_sav = self_bin_object(X['savings'])[0]
    iv_pes = self_bin_object(X['present_emp_since'])[0]
    iv_pss = self_bin_object(X['personal_status_sex'])[0]
    iv_od = self_bin_object(X['other_debtors'])[0]
    iv_pro = self_bin_object(X['property'])[0]
    iv_oip = self_bin_object(X['other_installment_plans'])[0]
    iv_hous = self_bin_object(X['housing'])[0]
    iv_job = self_bin_object(X['job'])[0]
    iv_tele = self_bin_object(X['telephone'])[0]
    iv_iaip = self_bin_object(X['installment_as_income_perc'])[0]
    iv_prs = self_bin_object(X['present_res_since'])[0]
    iv_ctb = self_bin_object(X['credits_this_bank'])[0]
    iv_pum = self_bin_object(X['people_under_maintenance'])[0]
    iv_dim = self_bin_numeric(X['duration_in_month'], 4)[0]
    iv_ca = self_bin_numeric(X['credit_amount'], 5)[0]
    iv_age = self_bin_numeric(X['age'], 5)[0]

    # IV values listed in the same order as X.columns.
    IV = [iv_acs, iv_dim, iv_ch, iv_pur, iv_ca, iv_sav, iv_pes, iv_iaip,
          iv_pss, iv_od, iv_prs, iv_pro, iv_age, iv_oip, iv_hous, iv_ctb,
          iv_job, iv_pum, iv_tele, iv_fw]
    indices = np.argsort(IV)[::-1]
    featurerank = [X.columns[i] for i in indices]

    plt.figure(figsize=(10, 8))
    feature_imp = pd.Series(IV, index=X.columns).sort_values(ascending=False)
    ivlist = pd.Series(IV).sort_values(ascending=False)
    for a, b in zip(ivlist.values, np.arange(0.2, 20.2, 1)):
        plt.text(a, b, round(a, 4))
    sns.barplot(x=feature_imp, y=feature_imp.index)
    plt.xlim((0, 1))
    plt.xlabel('Information Value')
    plt.ylabel('Attribute')
    plt.title("Visualizing Important Features")
    plt.show()
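# Sanity check of the WOE / IV arithmetic used in self_bin_object above, on
# an invented two-bucket attribute (toy numbers, not repository data):
import numpy as np
import pandas as pd

d1 = pd.DataFrame({'X': ['A', 'A', 'A', 'B', 'B', 'B'],
                   'Y': [1, 1, 0, 1, 0, 0]})   # Y: 1 = good, 0 = bad
badnum = int((d1['Y'] == 0).sum())    # 3 bad clients overall
goodnum = int((d1['Y'] == 1).sum())   # 3 good clients overall
g = d1.groupby('X')['Y']
good = g.sum()                        # good clients per bucket
bad = g.count() - good                # bad clients per bucket
woe = np.log((bad / badnum) / (good / goodnum))
iv = ((bad / badnum - good / goodnum) * woe).sum()
# Bucket A: woe = ln((1/3)/(2/3)) = -0.693; bucket B: woe = +0.693.
# iv = (1/3 - 2/3)(-0.693) + (2/3 - 1/3)(0.693) ≈ 0.462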
            estimator._mvalue(x_sample, y_sample)
            estimator.fit(x_sample, y_sample)
            y_pred.append(estimator.predict(x_test))
        # Majority vote over the +/-1 predictions of all estimators.
        result = sum(np.array(y_pred))
        result[result >= 1] = 1
        result[result <= -1] = -1
        return result


if __name__ == '__main__':
    data = DataDeal.get_data()
    Train_data, test = train_test_split(data, test_size=0.2)
    x_test = test[:, :-1]
    y_test = test[:, -1]
    x_train = Train_data[:, :-1]
    y_train = Train_data[:, -1]

    kernel_dict = {'type': 'RBF', 'sigma': 0.717}
    fuzzyvalue = {'type': 'Cen', 'function': 'Exp'}

    clf = FSVM.FSVM(3, kernel_dict, fuzzyvalue, 3 / 4)
    bag = Bagging(20, clf, 0.7, 'fsvm', 'UpSampling')
    y_pred = bag.MutModel_clf(x_train, y_train, x_test)
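# The clipping above implements a majority vote over +/-1 predictions.
# A toy illustration with invented votes (three estimators, four points):
import numpy as np

votes = [np.array([ 1,  1, -1,  1]),
         np.array([ 1, -1, -1,  1]),
         np.array([-1, -1, -1,  1])]
result = sum(np.array(votes))   # vote totals: [ 1 -1 -3  3]
result[result >= 1] = 1
result[result <= -1] = -1
print(result)                   # [ 1 -1 -1  1]
# Note: with an even number of estimators a tied vote sums to 0, which
# neither clipping step resolves.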
# y_predict = clf.predict(X_test)
# y_test = np.array(y_test)
# for i in range(len(y_test)):
#     if y_test[i] == 0:
#         y_test[i] = -1
# print(np.mean(y_predict != y_test))
# precision(y_predict, y_test)
#
# if __name__ == '__main__':
#     fsvmTrain('lowSampling')


# Test Code for _LSSVMtrain
if __name__ == "__main__":
    data = DataDeal.get_data("../german_numerical.csv")
    precisionArray = []
    X = data[:, :-1]
    y = data[:, -1]

    # data = pd.read_csv("../processedData.csv", sep=",", header=0)
    # # X = applyPcaWithStandardisation(data[data.columns[1:]], 0.9)
    # X = applyPcaWithNormalisation(data[data.columns[1:]], 0.9)
    # # X = np.array(data[data.columns[1:]])
    # y = np.array(data["default"].map({0: -1, 1: 1}))
    # parameter = grid_search(X, y, kernel='gaussian')

    sss = StratifiedShuffleSplit(n_splits=20, test_size=0.2, random_state=12)
    # sss = StratifiedKFold(n_splits=10, random_state=12, shuffle=True)
    for train, test in sss.split(X, y):
        X_test = X[test]
        y_test = y[test]
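# The fragment above is truncated inside the loop. A self-contained sketch of
# how such a StratifiedShuffleSplit evaluation is typically completed, using
# a plain sklearn SVC as a stand-in for the repository's classifiers
# (hypothetical helper, not the file's actual code):
import numpy as np
from sklearn import svm
from sklearn.model_selection import StratifiedShuffleSplit

def mean_accuracy(X, y, n_splits=20, test_size=0.2, random_state=12):
    scores = []
    sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size,
                                 random_state=random_state)
    for train, test in sss.split(X, y):
        clf = svm.SVC()                 # stand-in estimator
        clf.fit(X[train], y[train])
        scores.append(np.mean(clf.predict(X[test]) == y[test]))
    return np.mean(scores)              # accuracy averaged over the splits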