def Plot_Roc_Undersampling_RF(dataSet, MinorityData, MajorityData, numattrs, minorClassValue,
                              data_name, KNN_matrix, classificator_name):
    """Plot an ROC-style curve for majority under-sampling with RandomForest_c.

    For every under-sampling level the majority data is reduced with
    ``UnderSampling.underSampling``, merged with the unchanged minority
    data via ``unite_data`` and passed to ``RandomForest_c.Classifier``.
    Each returned ``(fp, tp)`` pair becomes one point of the curve; a
    ``(0, 0)`` result is skipped. The point ``(1, 1)`` is always added as
    the curve's end point. Points and curve are drawn through ``plt``;
    nothing is saved or shown here.

    Args:
        dataSet: Not used by this function.
        MinorityData: Minority-class samples (a sized sequence).
        MajorityData: Majority-class samples handed to the under-sampler.
        numattrs: Forwarded unchanged to the classifier.
        minorClassValue: Not used by this function.
        data_name: Text used as the prefix of the plot title.
        KNN_matrix: Not used by this function.
        classificator_name: Text inserted into the legend label.

    Returns:
        None.
    """
    # 18 points per ROC curve.
    # NOTE(review): the meaning of a level (presumably a percentage) is defined
    # by UnderSampling.underSampling - confirm there.
    Undersampling_levels = [10, 15, 25, 50, 75, 100, 125, 150, 175, 200,
                            300, 400, 500, 600, 700, 800, 1000, 2000]
    points = []
    for N in Undersampling_levels:
        print('undersampling ', N)
        underSampled_majority_Data = UnderSampling.underSampling(
            len(MinorityData), MajorityData, N)
        Undersampled_Dataset = unite_data(MinorityData, underSampled_majority_Data)
        print('total number of samples ', len(Undersampled_Dataset))
        # To evaluate a different classifier, replace this call and the legend
        # label of the curve below.
        fp, tp = RandomForest_c.Classifier(Undersampled_Dataset, numattrs)
        if fp == 0 and tp == 0:
            # A (0, 0) result is not plotted and not part of the curve.
            continue
        points.append((fp, tp))
        plt.scatter(fp, tp, s=10, c='g', marker='v')
    points.append((1, 1))

    # Sort the (fp, tp) pairs together. Sorting only the x-values would pair
    # every false-positive rate with an unrelated true-positive rate.
    points.sort()
    fprs = [x for x, _ in points]
    tprs = [y for _, y in points]

    roc_auc = 0
    try:
        roc_auc = metrics.auc(fprs, tprs)
    except ValueError as err:
        # Best effort: keep AUC = 0 in the legend but report why.
        # Assumes `metrics` is sklearn.metrics, whose auc() raises ValueError
        # for fewer than two points - TODO confirm.
        print('AUC could not be computed: ', err)

    plt.plot(fprs, tprs, color='g',
             label='Under-' + classificator_name + ', AUC = ' + "%.4f" % roc_auc, lw=1)
    plt.rcParams['font.size'] = 12
    plt.title(data_name + ' ROC')
    plt.xlabel('% False Positive')
    plt.ylabel('% True Positive')
    plt.xticks([0, 0.2, 0.4, 0.6, 0.8, 1])
    plt.yticks([0, 0.2, 0.4, 0.6, 0.8, 1])
    plt.grid(False)
    plt.legend(loc=0)
def Plot_Smote_ROC_LR(dataSet, MinorityData, MajorityData, numattrs, minorClassValue,
                      data_name, KNN_matrix, classificator_name):
    """Plot an ROC-style curve for SMOTE plus under-sampling with LogReg_c.

    Synthetic minority samples are generated once with ``SMOTE_c.SMOTE`` at
    a fixed SMOTE level of 300 and merged with the original minority data.
    For every under-sampling level the majority data is reduced with
    ``UnderSampling.underSampling``, merged with that enlarged minority set
    and passed to ``LogReg_c.Classifier``. Each returned ``(fp, tp)`` pair
    becomes one point of the curve; a ``(0, 0)`` result is skipped. The
    point ``(1, 1)`` is always added as the curve's end point. Points and
    curve are drawn through ``plt``; nothing is saved or shown here.

    Args:
        dataSet: Not used by this function.
        MinorityData: Minority-class samples (a sized sequence).
        MajorityData: Majority-class samples handed to the under-sampler.
        numattrs: Forwarded to ``SMOTE_c.SMOTE`` and to the classifier.
        minorClassValue: Forwarded to ``SMOTE_c.SMOTE``.
        data_name: Text used as the prefix of the plot title.
        KNN_matrix: Forwarded to ``SMOTE_c.SMOTE``.
        classificator_name: Text inserted into the legend label.

    Returns:
        None.
    """
    # 18 points per ROC curve.
    Undersampling_levels = [10, 15, 25, 50, 75, 100, 125, 150, 175, 200,
                            300, 400, 500, 600, 700, 800, 1000, 2000]
    # Best parameter found for this classifier.
    Smote_level = 300

    # NOTE(review): the trailing 5 is presumably the number of nearest
    # neighbours used by SMOTE - confirm against SMOTE_c.SMOTE.
    Synthetic_minority_Data = SMOTE_c.SMOTE(
        len(MinorityData), Smote_level, MinorityData, numattrs,
        minorClassValue, KNN_matrix, 5)
    Final_minority_Data = unite_data(Synthetic_minority_Data, MinorityData)
    final_min_len = len(Final_minority_Data)

    points = []
    for N in Undersampling_levels:
        print('undersampling ', N)
        underSampled_majority_Data = UnderSampling.underSampling(
            final_min_len, MajorityData, N)
        Undersampled_Smoted_Dataset = unite_data(
            Final_minority_Data, underSampled_majority_Data)
        print('total number of samples ', len(Undersampled_Smoted_Dataset))
        # The third returned value (precision) is not needed for the ROC curve.
        fp, tp, _precision = LogReg_c.Classifier(Undersampled_Smoted_Dataset, numattrs)
        if fp == 0 and tp == 0:
            # A (0, 0) result is not plotted and not part of the curve.
            continue
        points.append((fp, tp))
        plt.scatter(fp, tp, s=10, c='b', marker='o')
    points.append((1, 1))

    # Sort the (fp, tp) pairs together. Sorting only the x-values would pair
    # every false-positive rate with an unrelated true-positive rate.
    points.sort()
    fprs = [x for x, _ in points]
    tprs = [y for _, y in points]

    roc_auc = 0
    try:
        roc_auc = metrics.auc(fprs, tprs)
    except ValueError as err:
        # Best effort: keep AUC = 0 in the legend but report why.
        # Assumes `metrics` is sklearn.metrics, whose auc() raises ValueError
        # for fewer than two points - TODO confirm.
        print('AUC could not be computed: ', err)

    plt.plot(fprs, tprs, color='b',
             label=str(Smote_level) + '-Smote-' + classificator_name
             + ', AUC = ' + "%.4f" % roc_auc,
             lw=1)
    plt.rcParams['font.size'] = 12
    plt.title(data_name + ' ROC')
    plt.xlabel('% False Positive')
    plt.ylabel('% True Positive')
    plt.xticks([0, 0.2, 0.4, 0.6, 0.8, 1])
    plt.yticks([0, 0.2, 0.4, 0.6, 0.8, 1])
    plt.grid(False)
    plt.legend(loc=0)
def performSampling(dataFile, classColumnNumber, numattrs, minorClassValue, N=400):
    """Run the under-sampling, under-sampling + SMOTE and Naive Bayes steps.

    The data file is split with ``getSeparatedSamples`` into the full data
    set, the minority samples and the majority samples. Three steps follow
    in order:

    1. Under-sample, write the result with
       ``createNewDatasetFileUnderSampling`` and call ``C45.treeClassifier2``.
    2. Under-sample again, generate samples with ``SMOTE.SMOTE``, combine
       both with ``generateFinalSyntheticDataset``, write the result with
       ``createNewDatasetFileUnderSamplingSmote`` and call
       ``C45.treeClassifier2`` on the combined data.
    3. Call ``NaiveBayes.naiveBayes`` on the majority and minority samples.

    Args:
        dataFile: Forwarded to ``getSeparatedSamples``.
        classColumnNumber: Forwarded to ``getSeparatedSamples``.
        numattrs: Forwarded to the samplers and classifiers.
        minorClassValue: Forwarded to ``getSeparatedSamples`` and the samplers.
        N: Level passed to both ``UnderSampling.underSampling`` and
            ``SMOTE.SMOTE``; defaults to 400.

    Returns:
        None.
    """
    separated = getSeparatedSamples(dataFile, classColumnNumber, minorClassValue)
    dataSet, MinorityData, MajorityData = separated

    minority_count = len(MinorityData)
    print("Number of Minor samples present in the Dataset: ", minority_count)

    # Step 1: under-sampling only.
    under_only = UnderSampling.underSampling(
        len(MinorityData), dataSet, minorClassValue, numattrs, N)
    createNewDatasetFileUnderSampling(under_only)
    # NOTE(review): the classifier is given the full dataSet here, not the
    # under-sampled data written just above - confirm this is intended.
    C45.treeClassifier2(dataSet, numattrs, 'UnderSampling')

    # Step 2: under-sampling followed by SMOTE.
    under_for_smote = UnderSampling.underSampling(
        len(MinorityData), dataSet, minorClassValue, numattrs, N)
    smote_output = SMOTE.SMOTE(
        len(MinorityData), N, MinorityData, numattrs,
        under_for_smote, 5, minorClassValue)
    combined = generateFinalSyntheticDataset(under_for_smote, smote_output)
    createNewDatasetFileUnderSamplingSmote(combined)
    C45.treeClassifier2(combined, numattrs, 'SMOTEUndersampling')

    # Step 3: Naive Bayes on the separated classes.
    NaiveBayes.naiveBayes(MajorityData, MinorityData, numattrs)
def Plot_ROC_RandomOverSampling_RF(dataSet, MinorityData, MajorityData, numattrs,
                                   minorClassValue, data_name, classificator_name):
    """Plot an ROC-style curve for random over-sampling plus under-sampling.

    The minority data is first enlarged by drawing
    ``int(len(MinorityData) * 500 / 100)`` samples from it at random with
    replacement and merging them with the original minority data. For every
    under-sampling level the majority data is reduced with
    ``UnderSampling.underSampling``, merged with the enlarged minority set
    and passed to ``RandomForest_c.Classifier``. Each returned ``(fp, tp)``
    pair becomes one point of the curve; a ``(0, 0)`` result is skipped. The
    point ``(1, 1)`` is always added as the curve's end point. Points and
    curve are drawn through ``plt``; nothing is saved or shown here.

    Args:
        dataSet: Not used by this function.
        MinorityData: Minority-class samples (an indexable, sized sequence).
        MajorityData: Majority-class samples handed to the under-sampler.
        numattrs: Forwarded unchanged to the classifier.
        minorClassValue: Not used by this function.
        data_name: Text used as the prefix of the plot title.
        classificator_name: Text inserted into the legend label.

    Returns:
        None.
    """
    # 18 points per ROC curve.
    Undersampling_levels = [10, 15, 25, 50, 75, 100, 125, 150, 175, 200,
                            300, 400, 500, 600, 700, 800, 1000, 2000]
    # Best parameter found for this classifier on this data.
    Oversampling_level = 500

    print('minority ', len(MinorityData))
    extra_count = int(len(MinorityData) * Oversampling_level / 100)
    print('to choose from min ', extra_count)
    # Draw indices WITH replacement, so the same minority sample can be
    # picked more than once.
    random_Minority_indeces = np.random.choice(
        len(MinorityData), extra_count, replace=True)
    print('выбрал индексы мин класса')
    print('chosen ', len(random_Minority_indeces))
    Oversampled_minority_Data = [MinorityData[index]
                                 for index in random_Minority_indeces]
    Final_minority_Data = unite_data(Oversampled_minority_Data, MinorityData)
    final_min_len = len(Final_minority_Data)
    print('total min ', final_min_len)

    points = []
    for N in Undersampling_levels:
        print('undersampling ', N)
        underSampled_majority_Data = UnderSampling.underSampling(
            final_min_len, MajorityData, N)
        UnderOversampled_Dataset = unite_data(
            Final_minority_Data, underSampled_majority_Data)
        print('total number of samples ', len(UnderOversampled_Dataset))
        fp, tp = RandomForest_c.Classifier(UnderOversampled_Dataset, numattrs)
        if fp == 0 and tp == 0:
            # A (0, 0) result is not plotted and not part of the curve.
            continue
        points.append((fp, tp))
        plt.scatter(fp, tp, s=10, c='y', marker='s')
    points.append((1, 1))

    # Sort the (fp, tp) pairs together. Sorting only the x-values would pair
    # every false-positive rate with an unrelated true-positive rate.
    points.sort()
    fprs = [x for x, _ in points]
    tprs = [y for _, y in points]

    roc_auc = 0
    try:
        roc_auc = metrics.auc(fprs, tprs)
    except ValueError as err:
        # Best effort: keep AUC = 0 in the legend but report why.
        # Assumes `metrics` is sklearn.metrics, whose auc() raises ValueError
        # for fewer than two points - TODO confirm.
        print('AUC could not be computed: ', err)

    plt.plot(fprs, tprs, color='y',
             label=str(Oversampling_level) + '-Over-' + classificator_name
             + ', AUC = ' + "%.4f" % roc_auc,
             lw=1)
    plt.rcParams['font.size'] = 12
    plt.title(data_name + ' ROC')
    plt.xlabel('% False Positive')
    plt.ylabel('% True Positive')
    plt.xticks([0, 0.2, 0.4, 0.6, 0.8, 1])
    plt.yticks([0, 0.2, 0.4, 0.6, 0.8, 1])
    plt.grid(False)
    plt.legend(loc=0)
def Plot_Smote_ROC_parameters_LR(dataSet,MinorityData,MajorityData,numattrs,minorClassValue, data_name,KNN_matrix,classificator_name): #Для замены классификатора изменить две строки в данном коде и последний аргумент при вызове Undersampling_levels = [10, 15,25, 50, 75, 100, 125, 150, 175, 200, 300, 400, 500, 600, 700, 800, 1000, 2000] # 18 точек в каждой roc Smote_levels = [100, 200, 300, 400, 500] colors = ['r','g','c','b','y'] markers = ['o', 'v' ,'+', '*','s'] i = -1 fp, tp, precision = LogReg_c.Classifier(dataSet, numattrs) #fp, tp = RandomForest_c.Classifier(dataSet, numattrs) if fp == 0 and tp == 0: pass else: plt.plot(fp, tp, 'kx')#классификация на исходных данных for SN in Smote_levels: #for SN in [100]: fprs = [] tprs = [] precisions = [] i += 1 print('\n Smote number level ', SN) Final_minority_Data =[] Synthetic_minority_Data = [] Synthetic_minority_Data = SMOTE_c.SMOTE(len(MinorityData), SN, MinorityData, numattrs, minorClassValue, KNN_matrix, 5) Final_minority_Data = unite_data(Synthetic_minority_Data, MinorityData) f_min_len = len(Final_minority_Data) for N in Undersampling_levels: print('undersampling ', N) underSampled_majority_Data = [] Undersampled_Smoted_Dataset = [] underSampled_majority_Data = UnderSampling.underSampling(f_min_len, MajorityData, N) Undersampled_Smoted_Dataset = unite_data(Final_minority_Data, underSampled_majority_Data) print('total number of samples ',len(Undersampled_Smoted_Dataset)) fp, tp, precision = LogReg_c.Classifier(Undersampled_Smoted_Dataset, numattrs) #fp, tp = RandomForest_c.Classifier(Undersampled_Smoted_Dataset, numattrs) if fp == 0 and tp == 0: pass else: fprs.append(fp) tprs.append(tp) plt.scatter(fp, tp, s=10, c=colors[i],marker=markers[i]) fprs.append(1) tprs.append(1) #print('points',fprs, '\n', tprs) #fprs, tprs = clean_points(fprs, tprs) #print('points\n',fprs,'\n',tprs) #fprs.sort() #print('points',fprs, '\n', tprs) for j in range(len(fprs)): if (j <= len(fprs)-2) and (fprs[j] > fprs[j+1]): fprs.pop(j) 
tprs.pop(j) roc_auc = 0 try: roc_auc = metrics.auc(fprs, tprs) except Exception: pass plt.plot(fprs, tprs, color = colors[i],label=str(SN)+'-Smote, AUC = '+"%.4f" % roc_auc,lw=1) plt.rcParams['font.size'] = 12 plt.title(data_name+' ROC curves with '+classificator_name) plt.xlabel('% False Positive') plt.ylabel('% True Positive') plt.xticks([0,0.2,0.4,0.6,0.8,1]) plt.yticks([0,0.2,0.4,0.6,0.8,1]) plt.grid(False) plt.legend(loc=0) plt.savefig(data_name+' Smote_parameters ROC with '+classificator_name+'.png') #plt.savefig('ROC curves for '+classificator_type+' classificator'+'.pdf') plt.show() ''' Uncomment this and replace plotting line to plot the precision-recall curve with LogReg