Python UnderSampling示例，RandomUnderSampling.UnderSampling Python示例

示例#1

0

显示文件

def Plot_Roc_Undersampling_RF(dataSet,MinorityData,MajorityData,numattrs,minorClassValue, data_name,KNN_matrix,classificator_name):
    """Plot a ROC-style curve for a classifier across undersampling levels.

    For each undersampling level the majority data is passed through
    ``UnderSampling.underSampling``, merged with the minority data via
    ``unite_data`` and handed to ``RandomForest_c.Classifier``. The pair it
    returns is treated as one (false positive, true positive) point. The
    points, plus a closing (1, 1) point, are drawn with ``plt`` (presumably
    matplotlib.pyplot -- confirm against the file's imports) and the area
    under them is computed with ``metrics.auc`` (presumably sklearn.metrics).

    Args:
        dataSet: Not used by this function.
        MinorityData: Minority-class samples; merged unchanged into every
            undersampled dataset, and its length is passed to the undersampler.
        MajorityData: Majority-class samples to undersample.
        numattrs: Forwarded to the classifier.
        minorClassValue: Not used by this function.
        data_name: Prefix of the plot title.
        KNN_matrix: Not used by this function.
        classificator_name: Classifier name shown in the legend label.

    Returns:
        None. The curve is drawn on the current ``plt`` figure.
    """
    # 18 points in each ROC curve
    Undersampling_levels = [10, 15,25, 50, 75, 100, 125, 150, 175, 200, 300, 400, 500, 600, 700, 800, 1000, 2000]

    fprs = []
    tprs = []

    for N in Undersampling_levels:
        print('undersampling ', N)

        underSampled_majority_Data = UnderSampling.underSampling(len(MinorityData), MajorityData, N)
        Undersampled_Dataset = unite_data(MinorityData, underSampled_majority_Data)

        print('total number of samples ',len(Undersampled_Dataset))

        # To plot another classifier, replace this call and the legend label
        # of the curve below.
        fp, tp = RandomForest_c.Classifier(Undersampled_Dataset, numattrs)

        # A (0, 0) result is left out of the curve.
        if fp == 0 and tp == 0:
            continue

        fprs.append(fp)
        tprs.append(tp)
        plt.scatter(fp, tp, s=10, c='g',marker='v')

    # Close the curve at the top-right corner.
    fprs.append(1)
    tprs.append(1)

    # Sort the points by false positive value while keeping every
    # (fpr, tpr) pair together. Sorting fprs alone would attach each
    # true positive value to the wrong x-coordinate.
    roc_points = sorted(zip(fprs, tprs))
    fprs = [point[0] for point in roc_points]
    tprs = [point[1] for point in roc_points]

    roc_auc = 0
    try:
        roc_auc = metrics.auc(fprs, tprs)
    except ValueError as err:
        # Best effort: keep AUC = 0 in the legend (e.g. when fewer than two
        # points are available) but report why instead of hiding it.
        print('could not compute AUC: ', err)

    plt.plot(fprs, tprs, color = 'g',label='Under-'+classificator_name+', AUC = '+"%.4f" % roc_auc,lw=1)

    plt.rcParams['font.size'] = 12
    plt.title(data_name+' ROC')
    plt.xlabel('% False Positive')
    plt.ylabel('% True Positive')
    plt.xticks([0,0.2,0.4,0.6,0.8,1])
    plt.yticks([0,0.2,0.4,0.6,0.8,1])
    plt.grid(False)
    plt.legend(loc=0)

示例#2

0

显示文件

def Plot_Smote_ROC_LR(dataSet,MinorityData,MajorityData,numattrs,minorClassValue, data_name,KNN_matrix, classificator_name):
    """Plot a ROC-style curve for SMOTE plus undersampling with ``LogReg_c``.

    The minority data is first extended with the output of ``SMOTE_c.SMOTE``
    at a fixed SMOTE level of 300. For each undersampling level the majority
    data is then passed through ``UnderSampling.underSampling``, merged with
    the extended minority data via ``unite_data`` and handed to
    ``LogReg_c.Classifier``. The first two values it returns are treated as
    one (false positive, true positive) point. The points, plus a closing
    (1, 1) point, are drawn with ``plt`` (presumably matplotlib.pyplot --
    confirm against the file's imports) and the area under them is computed
    with ``metrics.auc`` (presumably sklearn.metrics).

    Args:
        dataSet: Not used by this function.
        MinorityData: Minority-class samples used as SMOTE input and kept in
            the final minority set.
        MajorityData: Majority-class samples to undersample.
        numattrs: Forwarded to SMOTE and to the classifier.
        minorClassValue: Forwarded to SMOTE.
        data_name: Prefix of the plot title.
        KNN_matrix: Forwarded to SMOTE.
        classificator_name: Classifier name shown in the legend label.

    Returns:
        None. The curve is drawn on the current ``plt`` figure.
    """
    # 18 points in each ROC curve
    Undersampling_levels = [10, 15,25, 50, 75, 100, 125, 150, 175, 200, 300, 400, 500, 600, 700, 800, 1000, 2000]
    # Best parameter for this classifier
    Smote_level = 300

    fprs = []
    tprs = []

    Synthetic_minority_Data = SMOTE_c.SMOTE(len(MinorityData), Smote_level, MinorityData, numattrs, minorClassValue, KNN_matrix, 5)
    Final_minority_Data = unite_data(Synthetic_minority_Data, MinorityData)
    final_min_len = len(Final_minority_Data)

    for N in Undersampling_levels:
        print('undersampling ', N)

        underSampled_majority_Data = UnderSampling.underSampling(final_min_len, MajorityData, N)

        Undersampled_Smoted_Dataset = unite_data(Final_minority_Data, underSampled_majority_Data)
        print('total number of samples ',len(Undersampled_Smoted_Dataset))

        # The third returned value (precision) is not needed for the ROC curve.
        fp, tp, _precision = LogReg_c.Classifier(Undersampled_Smoted_Dataset, numattrs)

        # A (0, 0) result is left out of the curve.
        if fp == 0 and tp == 0:
            continue

        fprs.append(fp)
        tprs.append(tp)
        plt.scatter(fp, tp, s=10, c='b', marker='o')

    # Close the curve at the top-right corner.
    fprs.append(1)
    tprs.append(1)

    # Sort the points by false positive value while keeping every
    # (fpr, tpr) pair together. Sorting fprs alone would attach each
    # true positive value to the wrong x-coordinate.
    roc_points = sorted(zip(fprs, tprs))
    fprs = [point[0] for point in roc_points]
    tprs = [point[1] for point in roc_points]

    roc_auc = 0
    try:
        roc_auc = metrics.auc(fprs, tprs)
    except ValueError as err:
        # Best effort: keep AUC = 0 in the legend (e.g. when fewer than two
        # points are available) but report why instead of hiding it.
        print('could not compute AUC: ', err)

    plt.plot(fprs, tprs, color = 'b',label=str(Smote_level)+'-Smote-'+classificator_name+', AUC = '+"%.4f" % roc_auc,lw=1)

    plt.rcParams['font.size'] = 12
    plt.title(data_name+' ROC')
    plt.xlabel('% False Positive')
    plt.ylabel('% True Positive')
    plt.xticks([0,0.2,0.4,0.6,0.8,1])
    plt.yticks([0,0.2,0.4,0.6,0.8,1])
    plt.grid(False)
    plt.legend(loc=0)

示例#3

0

显示文件

文件： main.py 项目： harishraman94/SMOTE

def performSampling(dataFile,classColumnNumber,numattrs,minorClassValue,N=400):
    """Run the sampling experiments on one data file.

    The file is split with ``getSeparatedSamples`` and three steps run in
    order: undersampling only, undersampling combined with SMOTE, and a
    naive Bayes run on the separated class data. The resampled datasets are
    handed to the ``createNewDatasetFile*`` helpers and the C4.5 classifier.

    Args:
        dataFile: Forwarded to ``getSeparatedSamples``.
        classColumnNumber: Forwarded to ``getSeparatedSamples``.
        numattrs: Forwarded to the samplers and classifiers.
        minorClassValue: Forwarded to the splitter and the samplers.
        N: Level passed to both ``underSampling`` and ``SMOTE``
            (default 400).

    Returns:
        None.
    """
    dataSet, MinorityData, MajorityData = getSeparatedSamples(dataFile, classColumnNumber, minorClassValue)
    minority_count = len(MinorityData)

    print("Number of Minor samples present in the Dataset: ", minority_count)

    # Step 1: undersampling only.
    only_undersampled = UnderSampling.underSampling(minority_count, dataSet, minorClassValue, numattrs, N)
    createNewDatasetFileUnderSampling(only_undersampled)
    # NOTE(review): the classifier is given the original dataSet here, not
    # the undersampled dataset -- confirm this is intended.
    C45.treeClassifier2(dataSet, numattrs, 'UnderSampling')

    # Step 2: undersampling followed by SMOTE (a second, separate
    # undersampling call is made for this step).
    undersampled = UnderSampling.underSampling(minority_count, dataSet, minorClassValue, numattrs, N)
    synthetic = SMOTE.SMOTE(minority_count, N, MinorityData, numattrs, undersampled, 5, minorClassValue)
    combined = generateFinalSyntheticDataset(undersampled, synthetic)
    createNewDatasetFileUnderSamplingSmote(combined)
    C45.treeClassifier2(combined, numattrs, 'SMOTEUndersampling')

    # Step 3: naive Bayes on the separated class data.
    NaiveBayes.naiveBayes(MajorityData, MinorityData, numattrs)

示例#4

0

显示文件

def Plot_ROC_RandomOverSampling_RF(dataSet,MinorityData,MajorityData,numattrs,minorClassValue, data_name,classificator_name):
    """Plot a ROC-style curve for random oversampling plus undersampling.

    The minority data is first extended with samples drawn from it at random
    with replacement, at a fixed oversampling level of 500 (percent of the
    minority size). For each undersampling level the majority data is then
    passed through ``UnderSampling.underSampling``, merged with the extended
    minority data via ``unite_data`` and handed to
    ``RandomForest_c.Classifier``. The pair it returns is treated as one
    (false positive, true positive) point. The points, plus a closing (1, 1)
    point, are drawn with ``plt`` (presumably matplotlib.pyplot -- confirm
    against the file's imports) and the area under them is computed with
    ``metrics.auc`` (presumably sklearn.metrics).

    Args:
        dataSet: Not used by this function.
        MinorityData: Minority-class samples to oversample.
        MajorityData: Majority-class samples to undersample.
        numattrs: Forwarded to the classifier.
        minorClassValue: Not used by this function.
        data_name: Prefix of the plot title.
        classificator_name: Classifier name shown in the legend label.

    Returns:
        None. The curve is drawn on the current ``plt`` figure.
    """
    # 18 points in each ROC curve
    Undersampling_levels = [10, 15,25, 50, 75, 100, 125, 150, 175, 200, 300, 400, 500, 600, 700, 800, 1000, 2000]
    # Best parameter for this classifier on this data
    Oversampling_level = 500

    print('minority ', len(MinorityData))

    oversample_count = int(len(MinorityData)*Oversampling_level/100)

    print('to choose from min ',oversample_count )

    # Draw oversample_count minority indices at random. Sampling is done
    # with replacement, so the same sample may be chosen more than once.
    random_Minority_indeces = np.random.choice(range(len(MinorityData)), oversample_count, replace=True)

    print('выбрал индексы мин класса')

    print ('chosen ', len(random_Minority_indeces))

    Oversampled_minority_Data = [MinorityData[index] for index in random_Minority_indeces]

    Final_minority_Data = unite_data(Oversampled_minority_Data, MinorityData)
    final_min_len = len(Final_minority_Data)

    print('total min ', final_min_len)

    fprs = []
    tprs = []

    for N in Undersampling_levels:
        print('undersampling ', N)

        underSampled_majority_Data = UnderSampling.underSampling(final_min_len, MajorityData, N)

        UnderOversampled_Dataset = unite_data(Final_minority_Data, underSampled_majority_Data)
        print('total number of samples ',len(UnderOversampled_Dataset))

        fp, tp = RandomForest_c.Classifier(UnderOversampled_Dataset, numattrs)

        # A (0, 0) result is left out of the curve.
        if fp == 0 and tp == 0:
            continue

        fprs.append(fp)
        tprs.append(tp)
        plt.scatter(fp, tp, s=10, c='y', marker='s')

    # Close the curve at the top-right corner.
    fprs.append(1)
    tprs.append(1)

    # Sort the points by false positive value while keeping every
    # (fpr, tpr) pair together. Sorting fprs alone would attach each
    # true positive value to the wrong x-coordinate.
    roc_points = sorted(zip(fprs, tprs))
    fprs = [point[0] for point in roc_points]
    tprs = [point[1] for point in roc_points]

    roc_auc = 0
    try:
        roc_auc = metrics.auc(fprs, tprs)
    except ValueError as err:
        # Best effort: keep AUC = 0 in the legend (e.g. when fewer than two
        # points are available) but report why instead of hiding it.
        print('could not compute AUC: ', err)

    plt.plot(fprs, tprs, color = 'y',label=str(Oversampling_level)+'-Over-'+classificator_name+', AUC = '+"%.4f" % roc_auc,lw=1)

    plt.rcParams['font.size'] = 12
    plt.title(data_name+' ROC')
    plt.xlabel('% False Positive')
    plt.ylabel('% True Positive')
    plt.xticks([0,0.2,0.4,0.6,0.8,1])
    plt.yticks([0,0.2,0.4,0.6,0.8,1])
    plt.grid(False)
    plt.legend(loc=0)

示例#5

0

显示文件

def Plot_Smote_ROC_parameters_LR(dataSet,MinorityData,MajorityData,numattrs,minorClassValue, data_name,KNN_matrix,classificator_name):
    #Для замены классификатора изменить две строки в данном коде и последний аргумент при вызове
    
    Undersampling_levels = [10, 15,25, 50, 75, 100, 125, 150, 175, 200, 300, 400, 500, 600, 700, 800, 1000, 2000]
    # 18 точек в каждой roc
    Smote_levels = [100, 200, 300, 400, 500]

    colors = ['r','g','c','b','y']
    markers = ['o', 'v' ,'+', '*','s']
    i = -1
    
    fp, tp, precision = LogReg_c.Classifier(dataSet, numattrs)
    #fp, tp = RandomForest_c.Classifier(dataSet, numattrs)
    if fp == 0 and tp == 0:
        pass
    else:
        plt.plot(fp, tp, 'kx')#классификация на исходных данных


    for SN in Smote_levels:
    #for SN in [100]:
        fprs = []
        tprs = []
        precisions = []
        i += 1

        print('\n Smote number level ', SN)

        Final_minority_Data =[] 
        Synthetic_minority_Data = []

        Synthetic_minority_Data = SMOTE_c.SMOTE(len(MinorityData), SN, MinorityData, numattrs, minorClassValue, KNN_matrix, 5)
        Final_minority_Data = unite_data(Synthetic_minority_Data, MinorityData)
        f_min_len = len(Final_minority_Data)

        for N in Undersampling_levels:
            print('undersampling ', N)

            underSampled_majority_Data = []
            Undersampled_Smoted_Dataset = []

            underSampled_majority_Data = UnderSampling.underSampling(f_min_len, MajorityData, N)
            
            Undersampled_Smoted_Dataset = unite_data(Final_minority_Data, underSampled_majority_Data)
            print('total number of samples ',len(Undersampled_Smoted_Dataset))     

            fp, tp, precision = LogReg_c.Classifier(Undersampled_Smoted_Dataset, numattrs)
            #fp, tp = RandomForest_c.Classifier(Undersampled_Smoted_Dataset, numattrs)

            if fp == 0 and tp == 0:
                pass
            else:
                fprs.append(fp)
                tprs.append(tp)
            
                plt.scatter(fp, tp, s=10, c=colors[i],marker=markers[i])
    
        fprs.append(1)
        tprs.append(1)
        
        #print('points',fprs, '\n', tprs)

        #fprs, tprs = clean_points(fprs, tprs)        
        #print('points\n',fprs,'\n',tprs)
        #fprs.sort()
        #print('points',fprs, '\n', tprs)
        for j in range(len(fprs)):
            if (j <= len(fprs)-2) and (fprs[j] > fprs[j+1]):
                fprs.pop(j)
                tprs.pop(j)
        

        roc_auc = 0
        try:
            roc_auc = metrics.auc(fprs, tprs)
        except Exception:
            pass

        plt.plot(fprs, tprs, color = colors[i],label=str(SN)+'-Smote, AUC = '+"%.4f" % roc_auc,lw=1)
    
    plt.rcParams['font.size'] = 12
    plt.title(data_name+' ROC curves with '+classificator_name)
    plt.xlabel('% False Positive')
    plt.ylabel('% True Positive')
    plt.xticks([0,0.2,0.4,0.6,0.8,1])
    plt.yticks([0,0.2,0.4,0.6,0.8,1])
    plt.grid(False)
    plt.legend(loc=0)

    plt.savefig(data_name+' Smote_parameters ROC with '+classificator_name+'.png')
    #plt.savefig('ROC curves for '+classificator_type+' classificator'+'.pdf')
    plt.show()




    ''' Uncomment this and replace plotting line to plot the precision-recall curve with LogReg