def test_sample_kmeans_density_estimation(data, density_exponent, cluster_balance_threshold):
    X, y = data
    smote = KMeansSMOTE(
        random_state=42,
        density_exponent=density_exponent,
        cluster_balance_threshold=cluster_balance_threshold,
    )
    smote.fit_resample(X, y)
def test_sample_kmeans_not_enough_clusters():
    rng = np.random.RandomState(42)
    X = rng.randn(30, 2)
    y = np.array([1] * 20 + [0] * 10)
    # Asking for 30 clusters on 30 samples leaves no cluster with enough
    # minority samples to oversample, so fitting must fail.
    smote = KMeansSMOTE(random_state=42, kmeans_estimator=30, k_neighbors=2)
    with pytest.raises(RuntimeError):
        smote.fit_resample(X, y)
def test_kmeans_smote_param_error(data, density_exponent, cluster_balance_threshold):
    X, y = data
    kmeans_smote = KMeansSMOTE(
        density_exponent=density_exponent,
        cluster_balance_threshold=cluster_balance_threshold,
    )
    with pytest.raises(ValueError, match="should be 'auto' when a string"):
        kmeans_smote.fit_resample(X, y)
def sample(xtrain, ytrain):
    sm = KMeansSMOTE(random_state=42)
    x_res, y_res = sm.fit_resample(xtrain, ytrain)
    # Restore the original column names lost during resampling.
    x_res = pd.DataFrame(x_res)
    x_res.columns = xtrain.columns
    # y_res = pd.DataFrame(y_res)
    # y_res.columns = ["Leak_type"]
    return x_res, y_res
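# A hedged usage sketch for the helper above. The blob data, column names,
# and sizes are illustrative assumptions, not from the original code; with
# well-separated classes the default KMeansSMOTE settings should find
# eligible clusters, but on harder data it can raise a RuntimeError.
import pandas as pd
from sklearn.datasets import make_blobs

X_arr, y_arr = make_blobs(n_samples=[450, 50], centers=[(0, 0), (5, 5)], random_state=0)
xtrain_demo = pd.DataFrame(X_arr, columns=["f0", "f1"])
ytrain_demo = pd.Series(y_arr)
x_bal, y_bal = sample(xtrain_demo, ytrain_demo)
print(x_bal.columns.tolist())  # column names survive the resampling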
def test_sample_kmeans_density_estimation(density_exponent, cluster_balance_threshold):
    X, y = make_classification(
        n_samples=10_000, n_classes=2, weights=[0.3, 0.7], random_state=42
    )
    smote = KMeansSMOTE(
        random_state=0,
        density_exponent=density_exponent,
        cluster_balance_threshold=cluster_balance_threshold,
    )
    smote.fit_resample(X, y)
def test_sample_kmeans_custom(data, k_neighbors, kmeans_estimator):
    X, y = data
    kmeans_smote = KMeansSMOTE(
        random_state=42, kmeans_estimator=kmeans_estimator, k_neighbors=k_neighbors
    )
    X_resampled, y_resampled = kmeans_smote.fit_resample(X, y)
    assert X_resampled.shape == (24, 2)
    assert y_resampled.shape == (24,)
    assert kmeans_smote.nn_k_.n_neighbors == 3
    assert kmeans_smote.kmeans_estimator_.n_clusters == 3
def over_sample_data(matrix, y_train):
    add_to_log('Over Sampling')
    add_to_log('Sample distribution %s' % Counter(y_train))
    b_line = KMeansSMOTE(
        k_neighbors=5,
        sampling_strategy='not majority',
        n_jobs=-1,
        random_state=3,
        kmeans_estimator=100,
    )
    matrix_resampled, y_resampled = b_line.fit_resample(matrix, y_train)
    add_to_log('Resample distribution %s' % Counter(y_resampled))
    return matrix_resampled, y_resampled
def kmeans_smote(X, y, visualize=False, pca2d=True, pca3d=True, tsne=True, pie_evr=True):
    sm = KMeansSMOTE(random_state=42)
    X_res, y_res = sm.fit_resample(X, y)
    if visualize:
        hist_over_and_undersampling(y_res)
        pca_general(X_res, y_res, d2=pca2d, d3=pca3d, pie_evr=pie_evr)
    return X_res, y_res
def test_kmeans_smote(data):
    X, y = data
    kmeans_smote = KMeansSMOTE(
        kmeans_estimator=1, random_state=42, cluster_balance_threshold=0.0, k_neighbors=5
    )
    smote = SMOTE(random_state=42)
    X_res_1, y_res_1 = kmeans_smote.fit_resample(X, y)
    X_res_2, y_res_2 = smote.fit_resample(X, y)
    # With a single cluster and no balance filtering, KMeansSMOTE
    # degenerates to plain SMOTE.
    assert_allclose(X_res_1, X_res_2)
    assert_array_equal(y_res_1, y_res_2)
    assert kmeans_smote.nn_k_.n_neighbors == 6
    assert kmeans_smote.kmeans_estimator_.n_clusters == 1
    assert 'batch_size' in kmeans_smote.kmeans_estimator_.get_params()
def Resampling(train_x, train_y, resampling_method):
    train_y.data = LabelEncoder().fit_transform(train_y.data)
    # summarize distribution
    # Uncomment the line below to show a pie chart of the class
    # distribution before resampling.
    # plotGraphics.piePlot(train_y, "Before Resampling")

    # ---- UNDER-SAMPLING ------ #
    if resampling_method == "ClusterCentroids":
        resample = ClusterCentroids(voting='hard', random_state=42)
    elif resampling_method == "CondensedNearestNeighbour":
        resample = CondensedNearestNeighbour(n_neighbors=7, random_state=42)
    elif resampling_method == "EditedNearestNeighbours":
        resample = EditedNearestNeighbours(n_neighbors=7, kind_sel='mode', n_jobs=-1)
    elif resampling_method == "RepeatedEditedNearestNeighbours":
        resample = RepeatedEditedNearestNeighbours(n_neighbors=7, kind_sel='mode', n_jobs=-1)
    elif resampling_method == "AllKNN":
        resample = AllKNN(n_neighbors=7, kind_sel='mode', allow_minority=True, n_jobs=-1)
    elif resampling_method == "NearMiss":
        resample = NearMiss(n_neighbors=7, n_jobs=-1)
    elif resampling_method == "NeighbourhoodCleaningRule":
        resample = NeighbourhoodCleaningRule(n_neighbors=7, kind_sel='all')
    elif resampling_method == "RandomUnderSampler":
        resample = RandomUnderSampler(random_state=42)
    elif resampling_method == "TomekLinks":
        resample = TomekLinks(n_jobs=-1)

    # ---- OVER-SAMPLING ------ #
    elif resampling_method == "BorderlineSMOTE":
        resample = BorderlineSMOTE(random_state=42, n_jobs=-1)
    elif resampling_method == "KMeansSMOTE":
        resample = KMeansSMOTE(random_state=42)
    elif resampling_method == "RandomOverSampler":
        resample = RandomOverSampler(random_state=42)
    elif resampling_method == "SMOTE":
        resample = SMOTE(random_state=42, n_jobs=-1)

    # transform the dataset
    train_x.data, train_y.data = resample.fit_resample(train_x.data, train_y.data)
def over_sample(self, method="BorderLine", sampling_strategy="minority", random_state=42, k_neighbors=5, n_neighbors=10, kind="borderline-1"): """ 过采样方法 :param method: str, option: ADASYN, BorderLine,KMeans,Random,SVM :param sampling_strategy:str or dict, option: 'minority','not majority','all','auto', {1:n,0:m} :param random_state:int :param k_neighbors:int :param n_neighbors:int :param kind:str, borderline-1,borderline-2 :return:df """ feature_name = self._df.columns.difference(["id", self._target]).tolist() X = self._df[feature_name].values y = self._df[self._target].values print("Original label shape {}".format(Counter(y))) if method == "ADASYN": overSm = ADASYN(sampling_strategy=sampling_strategy, random_state=random_state, n_neighbors=k_neighbors) elif method == "BorderLine": overSm = BorderlineSMOTE(sampling_strategy=sampling_strategy, random_state=random_state, k_neighbors=k_neighbors, m_neighbors=n_neighbors, kind=kind) elif method == "KMeans": overSm = KMeansSMOTE(sampling_strategy=sampling_strategy, random_state=random_state, k_neighbors=k_neighbors) elif method == "Random": overSm = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=random_state) elif method == "SVM": overSm = SVMSMOTE(sampling_strategy=sampling_strategy, random_state=random_state, k_neighbors=k_neighbors, m_neighbors=n_neighbors, out_step=0.5) else: print("不支持{}该抽样方法".format(method)) return self._df X_res, y_res = overSm.fit_resample(X, y) print("overSample label shape {}".format(Counter(y_res))) _data = np.concatenate([X_res, y_res.reshape(len(X_res), 1)], axis=1) df_new = pd.DataFrame(data=_data, columns=feature_name + [self._target]) return df_new
def __get_smote(self):
    if self.algorithm == 'Borderline':
        return BorderlineSMOTE(random_state=RANDOM_STATE)
    elif self.algorithm == 'KMeans':
        return KMeansSMOTE(random_state=RANDOM_STATE, kmeans_estimator=KMeans(n_clusters=20))
    elif self.algorithm == 'SVM':
        return SVMSMOTE(random_state=RANDOM_STATE)
    elif self.algorithm == 'Tomek':
        return SMOTETomek(random_state=RANDOM_STATE)
    return SMOTE(random_state=RANDOM_STATE)
def equalize_training_dataset_with_KMeansSMOTE(x_train, y_train):
    from imblearn.over_sampling import KMeansSMOTE

    old_shape = list(x_train.shape)
    # Flatten to 2-D before using the over-/under-sampling method.
    x_tmp = np.reshape(x_train, (x_train.shape[0], -1))
    x_resampled, y_resampled = KMeansSMOTE(
        sampling_strategy='not majority', n_jobs=8, cluster_balance_threshold=0.009
    ).fit_resample(x_tmp, y_train)
    print(sorted(Counter(y_resampled).items()))
    # Restore the original dimensionality after resampling.
    old_shape[0] = x_resampled.shape[0]
    x_resampled = np.reshape(x_resampled, tuple(old_shape))
    return x_resampled, y_resampled
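# A hypothetical round-trip with the helper above, using the 8x8 digit
# images as stand-in data; the dataset choice is an assumption, not from
# the original code. The helper flattens (n, 8, 8) to (n, 64) for
# KMeansSMOTE and restores the image shape afterwards. The clustering
# filter can still fail with a RuntimeError on other data, in which case
# cluster_balance_threshold needs retuning.
from sklearn.datasets import load_digits

digits = load_digits()
x_imgs = digits.images   # shape (1797, 8, 8)
y_lbls = digits.target
x_bal, y_bal = equalize_training_dataset_with_KMeansSMOTE(x_imgs, y_lbls)
print(x_bal.shape)       # (n_resampled, 8, 8)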
def runSmote(X, y, algorithm='default', split_synthetic=False, verbose=True):
    if verbose:
        log.info("Data before oversampling")
        log.info("Dataset: {0}, {1}".format(X.shape, len(y)))

    n_casos = np.count_nonzero(y == 1)
    n_controles = np.count_nonzero(y == 0)
    N = abs(n_casos - n_controles)

    if algorithm == 'Borderline':
        if verbose:
            log.info("Running Borderline Smote")
        X_novo, y_novo = BorderlineSMOTE(random_state=random_state).fit_resample(X, y)
    elif algorithm == 'KMeans':
        if verbose:
            log.info("Running KMeans Smote")
        X_novo, y_novo = KMeansSMOTE(
            random_state=random_state, kmeans_estimator=KMeans(n_clusters=20)
        ).fit_resample(X, y)
    elif algorithm == 'SVM':
        if verbose:
            log.info("Running SVM Smote")
        X_novo, y_novo = SVMSMOTE(random_state=random_state).fit_resample(X, y)
    elif algorithm == 'Tomek':
        if verbose:
            log.info("Running Smote Tomek")
        X_novo, y_novo = SMOTETomek(random_state=random_state).fit_resample(X, y)
    else:
        if verbose:
            log.info("Running default Smote")
        X_novo, y_novo = SMOTE(random_state=random_state).fit_resample(X, y)

    if verbose:
        log.info("Data after oversampling")
        log.info("Dataset: {0}, {1}".format(X_novo.shape, len(y_novo)))

    if split_synthetic:
        synthetic_X = X_novo[-N:]
        synthetic_y = y_novo[-N:]
        return X, y, synthetic_X, synthetic_y
    else:
        return X_novo, y_novo, None, None
def over_sample(X, y, sampler="RandomUnderSampler"): samplers = { "RandomOverSampler": RandomOverSampler(), "KMeansSMOTE": KMeansSMOTE(), "ADASYN": ADASYN(), "SMOTE": SMOTE(), "BorderlineSMOTE": BorderlineSMOTE(), "SVMSMOTE": SVMSMOTE(), "SMOTENC": SMOTENC(categorical_features=[]), } sampler = samplers[sampler] # plot y class count before and after resample print("before", sorted(Counter(y).items())) # to resample simply call fit_resample method of sampler X_resampled, y_resampled = sampler.fit_resample(X, y) print("after", sorted(Counter(y_resampled).items())) print('===' * 4, 'under_sample finished') return X_resampled, y_resampled
def target_training_data(targetclass):
    ##### target_training_data(targetclass) generates the second half of the training data set.
    ##### Generation of the evaluation data set is delegated to all_target_training_data(Nnofs, Nnofs_evaluate, fractrain).
    import dictionary
    dictionary = dictionary.dict
    print(' ')
    print('Resampling training set for class', targetclass)
    X1 = 3; X2 = 40  # components of the high-dimensional data point shown in the (commented-out) scatter plots
    classes = [keys for keys in dictionary]
    classdir = classes
    traincontainer = []; traincontainer_y = []
    origcontainer = []; origcontainer_y = []
    origcontainer_ynn = []; traincontainer_ynn = []
    appendsecondhalf = []
    lensh = 0
    # lensh = max(lensh, len(secondhalf))
    for i in range(len(classdir)):
        cl = classdir[i]
        dirinclass = os.listdir(cl)
        lendirinclass = len(dirinclass)
        dirinclass = [os.path.join(cl, dirinclass[i], 'ta.npy') for i in range(lendirinclass)]
        #################### targetclass ############
        fnorig = str(targetclass) + '.orig.npy'
        fnorig_y = str(targetclass) + '_y.orig.npy'
        fnorig_ynn = str(targetclass) + '_ynn.orig.npy'
        fntrain = str(targetclass) + '.train.npy'
        fntrain_y = str(targetclass) + '_y.train.npy'
        fntrain_ynn = str(targetclass) + '_ynn.train.npy'
        if cl == targetclass:
            # Samples of the target class are shuffled and split in two:
            # the first half is held out, the second half goes into training
            # with label 0.
            shuffle(dirinclass)
            firsthalf = dirinclass[0:int(fractrain * len(dirinclass))]
            secondhalf = dirinclass[int(fractrain * len(dirinclass)):]
            appendsecondhalf.append(secondhalf)
            for k in range(len(firsthalf)):
                datain = np.load(firsthalf[k])
                origcontainer.append(datain)
                origcontainer_y.append(0)
                origcontainer_ynn.append([0, 1])
            for k in range(len(secondhalf)):
                datain = np.load(secondhalf[k])
                traincontainer.append(datain)
                traincontainer_y.append(0)
                traincontainer_ynn.append([0, 1])
        else:
            # Samples of every other class go into training with label 1.
            for k in range(len(dirinclass)):
                nnpy = dirinclass[k]
                if os.path.isfile(nnpy):
                    datain = np.load(nnpy)
                    traincontainer.append(datain)
                    traincontainer_y.append(1)
                    traincontainer_ynn.append([0, 1])
    origcontainer = np.array(origcontainer)
    origcontainer_y = np.array(origcontainer_y)
    traincontainer = np.array(traincontainer)
    traincontainer_y = np.array(traincontainer_y)
    origcontainer_ynn = np.array(origcontainer_ynn)
    traincontainer_ynn = np.array(traincontainer_ynn)
    # np.save(fnorig, origcontainer)
    # np.save(fnorig_y, origcontainer_y)
    # np.save(fnorig_ynn, origcontainer_ynn)
    np.save(fntrain, traincontainer)
    np.save(fntrain_y, traincontainer_y)
    np.save(fntrain_ynn, traincontainer_ynn)
    #################### end of targetclass ############

    ####################################################
    # Begin oversampling of the training set.
    X = traincontainer
    y = traincontainer_y
    # Flatten the square images to 2-D for the samplers.
    X = np.reshape(X, (X.shape[0], X.shape[2] * X.shape[2]))

    # scatter plot of X and y (commented out):
    # plt.xlabel('x'); plt.ylabel('y')
    # plt.scatter(X[:, X1], X[:, X2], marker='o',
    #             c=y, s=25, edgecolor='k', cmap=plt.cm.coolwarm)
    # plt.show()

    #### creating sampling_strategy ####
    sampling_strategy = {}
    print('npycountt:', npycountt)
    sampling_strategy[0] = Nnofs * npycountt
    sampling_strategy[1] = Nnofs * npycountt
    print('sampling_strategy (training set) = ', sampling_strategy)

    #### implementing oversampling ####
    if sampler_train == 'SMOTE':
        k = 2; seed = 10; n_jobs = -1
        X_res, y_res = SMOTE(sampling_strategy=sampling_strategy, k_neighbors=k - 1,
                             random_state=seed, n_jobs=n_jobs).fit_resample(X, y)
    if sampler_train == 'BorderlineSMOTE':
        k = 2; seed = 10; n_jobs = -1
        X_res, y_res = imblearn.over_sampling.BorderlineSMOTE(
            sampling_strategy=sampling_strategy, random_state=seed,
            k_neighbors=k, n_jobs=n_jobs).fit_resample(X, y)
    if sampler_train == 'ADASYN':
        k = 3; seed = 10; n_jobs = -1
        X_res, y_res = ADASYN(random_state=seed, sampling_strategy=sampling_strategy,
                              n_neighbors=k + 1, n_jobs=n_jobs).fit_resample(X, y)
    if sampler_train == 'KMeansSMOTE':
        k = 2; seed = 10; n_jobs = -1
        X_res, y_res = KMeansSMOTE(sampling_strategy=sampling_strategy, random_state=seed,
                                   k_neighbors=k + 2, n_jobs=n_jobs).fit_resample(X, y)
    if sampler_train == 'RandomOverSampler':
        k = 2; seed = 10
        X_res, y_res = RandomOverSampler(sampling_strategy=sampling_strategy,
                                         random_state=seed).fit_resample(X, y)
    if sampler_train == 'SVMSMOTE':
        k = 4
        m_neighbors = 2 * k
        n_jobs = -1; seed = 10
        X_res, y_res = SVMSMOTE(sampling_strategy=sampling_strategy, random_state=seed,
                                k_neighbors=k, n_jobs=n_jobs).fit_resample(X, y)
    #### end of oversampling ####

    # Companion labels [y, 1 - y] for each resampled sample.
    y_resnn = [[y_res[i], np.abs(y_res[i] - 1)] for i in range(len(y_res))]

    # Reshape the flattened samples back to square images.
    dim = int(X_res.shape[1] ** 0.5)
    X_res = X_res.reshape(X_res.shape[0], dim, dim)

    ### report sizes of data before and after oversampling
    print("counter before oversampling (training set) = ", sorted(Counter(y).items())[0])
    norigsample = sorted(Counter(y).items())[0][1]
    print("counter after oversampling (training set) = ", sorted(Counter(y_res).items()))
    novsp = Counter(y_res)[0]
    print('Total number of data before oversampling:', norigsample)
    print('Total number of data after oversampling:', novsp)
    print('Ratio of number of data after and before oversampling of training data:', novsp / norigsample)

    ### save oversampled data
    fnres = targetclass + '_ovsp.train.npy'
    fnres_y = targetclass + '_y_ovsp.train.npy'
    fnres_ynn = targetclass + '_ynn_ovsp.train.npy'
    np.save(fnres_y, y_res)
    np.save(fnres_ynn, y_resnn)
    np.save(fnres, X_res)
    ####################################################
    return firsthalf, appendsecondhalf
def kmeans_smote(x, y):
    print("----KMeans SMOTE----")
    sampler = KMeansSMOTE(random_state=42)
    X, y = sampler.fit_resample(x, y)
    return X, y
# The KMeans version first clusters the data and then generates samples in
# each cluster independently, depending on each cluster's density.

fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6), (ax7, ax8),
      (ax9, ax10)) = plt.subplots(5, 2, figsize=(15, 30))
X, y = create_dataset(n_samples=5000, weights=(0.01, 0.05, 0.94), class_sep=0.8)

ax_arr = ((ax1, ax2), (ax3, ax4), (ax5, ax6), (ax7, ax8), (ax9, ax10))
for ax, sampler in zip(ax_arr,
                       (SMOTE(random_state=0),
                        BorderlineSMOTE(random_state=0, kind='borderline-1'),
                        BorderlineSMOTE(random_state=0, kind='borderline-2'),
                        KMeansSMOTE(random_state=0),
                        SVMSMOTE(random_state=0))):
    clf = make_pipeline(sampler, LinearSVC())
    clf.fit(X, y)
    plot_decision_function(X, y, clf, ax[0])
    ax[0].set_title('Decision function for {}'.format(
        sampler.__class__.__name__))
    plot_resampling(X, y, sampler, ax[1])
    ax[1].set_title('Resampling using {}'.format(sampler.__class__.__name__))
fig.tight_layout()

###############################################################################
# When dealing with a mix of continuous and categorical features, SMOTE-NC
# is the only method which can handle this case.

# create a synthetic data set with continuous and categorical features
rng = np.random.RandomState(42)
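# A minimal sketch of controlling the clustering stage of KMeansSMOTE
# explicitly; the MiniBatchKMeans settings and threshold below are
# illustrative assumptions, not values from the example above. Only
# clusters whose minority share exceeds cluster_balance_threshold are
# oversampled, and synthetic samples are allocated across the selected
# clusters according to their sparsity; if no cluster passes the filter,
# a RuntimeError is raised and the threshold must be lowered.
from sklearn.cluster import MiniBatchKMeans
from imblearn.over_sampling import KMeansSMOTE

sampler = KMeansSMOTE(
    random_state=0,
    kmeans_estimator=MiniBatchKMeans(n_clusters=10, random_state=0),
    cluster_balance_threshold=0.1,
)
X_res, y_res = sampler.fit_resample(X, y)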
def kmeans_smote(X, y):
    sm = KMeansSMOTE(random_state=42)
    X_res, y_res = sm.fit_resample(X, y)
    return X_res, y_res
def test_sample_kmeans_not_enough_clusters(data):
    X, y = data
    smote = KMeansSMOTE(cluster_balance_threshold=10, random_state=42)
    with pytest.raises(RuntimeError):
        smote.fit_resample(X, y)
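# A KMeansSMOTE configuration that finds no eligible cluster, as in the
# test above, surfaces as a RuntimeError at fit time. A hedged sketch of a
# fallback a caller might use; falling back to plain SMOTE is an
# illustrative choice, not part of the tests above.
from imblearn.over_sampling import KMeansSMOTE, SMOTE


def resample_with_fallback(X, y, random_state=42):
    try:
        return KMeansSMOTE(random_state=random_state).fit_resample(X, y)
    except RuntimeError:
        # No cluster had enough minority samples; use plain SMOTE instead.
        return SMOTE(random_state=random_state).fit_resample(X, y)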
ids=["borderline", "svm"]) def test_smote_m_neighbors(numerical_data, smote): # check that m_neighbors is properly set. Regression test for: # https://github.com/scikit-learn-contrib/imbalanced-learn/issues/568 X, y = numerical_data _ = smote.fit_resample(X, y) assert smote.nn_k_.n_neighbors == 6 assert smote.nn_m_.n_neighbors == 11 @pytest.mark.parametrize( "smote, neighbor_estimator_name", [ (ADASYN(random_state=0), "n_neighbors"), (BorderlineSMOTE(random_state=0), "k_neighbors"), (KMeansSMOTE(random_state=1), "k_neighbors"), (SMOTE(random_state=0), "k_neighbors"), (SVMSMOTE(random_state=0), "k_neighbors"), ], ids=["adasyn", "borderline", "kmeans", "smote", "svm"], ) def test_numerical_smote_custom_nn(numerical_data, smote, neighbor_estimator_name): X, y = numerical_data params = { neighbor_estimator_name: _CustomNearestNeighbors(n_neighbors=5), } smote.set_params(**params) X_res, _ = smote.fit_resample(X, y) assert X_res.shape[0] >= 120
print(y.value_counts())
y = np.ravel(y)
print(y.shape)

X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=42)
# forest.fit(X_train, y_train)
# print("Original set\n{}".format(classification_report(y_test, forest.predict(X_test))))

pca = PCA(n_components=2)
# Fit and transform X to visualise inside a 2D feature space
X_vis = pca.fit_transform(X_train)

# Apply KMeansSMOTE over-sampling
ada = KMeansSMOTE(random_state=42)
X_resampled, y_resampled = ada.fit_resample(X_train, y_train)
y_resampled = np.ravel(y_resampled)
forest.fit(X_resampled, y_resampled)
print(Counter(y_resampled))
print(y_resampled.shape)
X_res_vis = pca.transform(X_resampled)
print("KMeansSMOTE\n{}".format(
    classification_report(y_test, forest.predict(X_test))))

f, (ax1, ax2) = plt.subplots(1, 2)
c0 = ax1.scatter(X_vis[y_train == 0, 0], X_vis[y_train == 0, 1], label="Class #0",
    y_resampled, test_size=0.2)
model_resample = lr.fit(x_train, y_train)
y_predict = model_resample.predict(x_test)
print(classification_report(y_test, y_predict))

from imblearn.over_sampling import BorderlineSMOTE
sm = BorderlineSMOTE(random_state=2020)
X_res, y_res = sm.fit_resample(x, y)
x_train, x_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2)
model_resample = lr.fit(x_train, y_train)
y_predict = model_resample.predict(x_test)
print(classification_report(y_test, y_predict))

from imblearn.over_sampling import KMeansSMOTE
sm = KMeansSMOTE(random_state=2020, cluster_balance_threshold=0.1)
X_res, y_res = sm.fit_resample(x, y)
x_train, x_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2)
model_resample = lr.fit(x_train, y_train)
y_predict = model_resample.predict(x_test)
print(classification_report(y_test, y_predict))

from imblearn.over_sampling import SVMSMOTE
sm = SVMSMOTE(random_state=2020)
X_res, y_res = sm.fit_resample(x, y)
x_train, x_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2)
model_resample = lr.fit(x_train, y_train)
y_predict = model_resample.predict(x_test)
'''SMOTE optimisation'''
best_params_smote = svc_param_selection(Xtrain, ytrain, 5)
SVM_smote = svm.SVC(kernel='rbf', C=best_params_smote['C'],
                    gamma=best_params_smote['gamma'], class_weight='balanced')
print('ideal C value for SVM SMOTE', best_params_smote['C'],
      'ideal gamma value for SVM SMOTE', best_params_smote['gamma'])
border_sm = BorderlineSMOTE(k_neighbors=27, random_state=91, sampling_strategy=1)
sm = SVMSMOTE(random_state=91, k_neighbors=2, sampling_strategy=1, svm_estimator=SVM_smote)
ada = ADASYN(random_state=91, n_neighbors=27, sampling_strategy=1, n_jobs=6)
Kmeans = KMeansSMOTE(random_state=91, k_neighbors=2, sampling_strategy=1, n_jobs=6,
                     kmeans_estimator=MiniBatchKMeans(n_clusters=20))

'''Synthetic sampling'''
# Xtrain, ytrain = SMOTE().fit_resample(Xtrain, ytrain)
Xtrain, ytrain = border_sm.fit_resample(Xtrain, ytrain)

'''Feature selection'''
# rel_MI = SelectKBest(score_func=score_func, k=num_features)
# Xtrain = rel_MI.fit_transform(Xtrain, ytrain)
# Xtest = rel_MI.transform(Xtest)
# rel_MI_support = rel_MI.get_support()
# rel_MI_feature = X_frame.loc[:, rel_MI_support].columns.tolist()
# rel_MI_scores = rel_MI.scores_[rel_MI_support].tolist()
# feature_selection_df = pd.DataFrame({'Feature': rel_MI_feature, 'Score': rel_MI_scores})
adaBoost.fit(X_train, y_train)
res = adaBoost.predict(features[test_index])
bl_smote_scores['AB'] += metrics.f1_score(res, target[test_index])
bl_smote_con_mat['AB'] += confusion_matrix(y_true=target[test_index], y_pred=res)

# Gradient Boost Classifier
gradBoost = GradientBoostingClassifier(random_state=0)
gradBoost.fit(X_train, y_train)
res = gradBoost.predict(features[test_index])
bl_smote_scores['GB'] += metrics.f1_score(res, target[test_index])
bl_smote_con_mat['GB'] += confusion_matrix(y_true=target[test_index], y_pred=res)

# K-Means SMOTE
km_smote = KMeansSMOTE(random_state=0)
X_train, y_train = km_smote.fit_resample(features[train_index], target[train_index])
# unique, counts = np.unique(y_train, return_counts=True)
# print("Kmeans uni, count:", np.asarray((unique, counts)).T)

# Logistic Regression
logistic = LogisticRegression(random_state=0)
logistic.fit(X_train, y_train)
res = logistic.predict(features[test_index])
km_scores['LR'] += metrics.f1_score(res, target[test_index])
km_con_mat['LR'] += confusion_matrix(y_true=target[test_index], y_pred=res)

# Ada Boost Classifier
adaBoost = AdaBoostClassifier(random_state=0)
adaBoost.fit(X_train, y_train)
def fscore(params_org):
    parambk = copy.deepcopy(params_org)
    ifError = 0
    global best, HPOalg, params_best, errorcount
    params = params_org['classifier']
    classifier = params.pop('name')
    p_random_state = params.pop('random_state')

    # Build the classifier.
    if classifier == 'SVM':
        param_value = params.pop('gamma_value')
        if params['gamma'] == "value":
            params['gamma'] = param_value
        # max_iter=10000 and cache_size=700: https://github.com/EpistasisLab/pennai/issues/223
        # max value: https://github.com/hyperopt/hyperopt-sklearn/blob/fd718c44fc440bd6e2718ec1442b1af58cafcb18/hpsklearn/components.py#L262
        clf = SVC(max_iter=10000, cache_size=700, random_state=p_random_state, **params)
    elif classifier == 'RF':
        clf = RandomForestClassifier(random_state=p_random_state, **params)
    elif classifier == 'KNN':
        p_value = params.pop('p')
        if p_value == 0:
            params['metric'] = "chebyshev"
        elif p_value == 1:
            params['metric'] = "manhattan"
        elif p_value == 2:
            params['metric'] = "euclidean"
        else:
            params['metric'] = "minkowski"
            params['p'] = p_value
        # https://github.com/hyperopt/hyperopt-sklearn/blob/fd718c44fc440bd6e2718ec1442b1af58cafcb18/hpsklearn/components.py#L302
        clf = KNeighborsClassifier(**params)
    elif classifier == 'DTC':
        clf = DecisionTreeClassifier(random_state=p_random_state, **params)
    elif classifier == 'LR':
        penalty_solver = params.pop('penalty_solver')
        params['penalty'] = penalty_solver.split("+")[0]
        params['solver'] = penalty_solver.split("+")[1]
        clf = LogisticRegression(random_state=p_random_state, **params)

    # Resampling parameters.
    p_sub_params = params_org.pop('sub')
    p_sub_type = p_sub_params.pop('type')
    sampler = p_sub_params.pop('smo_grp')
    gmean = []

    # Over-sampling
    if p_sub_type == 'SMOTE':
        smo = SMOTE(**p_sub_params)
    elif p_sub_type == 'ADASYN':
        smo = ADASYN(**p_sub_params)
    elif p_sub_type == 'BorderlineSMOTE':
        smo = BorderlineSMOTE(**p_sub_params)
    elif p_sub_type == 'SVMSMOTE':
        smo = SVMSMOTE(**p_sub_params)
    elif p_sub_type == 'SMOTENC':
        smo = SMOTENC(**p_sub_params)
    elif p_sub_type == 'KMeansSMOTE':
        smo = KMeansSMOTE(**p_sub_params)
    elif p_sub_type == 'RandomOverSampler':
        smo = RandomOverSampler(**p_sub_params)
    # Under-sampling
    elif p_sub_type == 'TomekLinks':
        smo = TomekLinks(**p_sub_params)
    elif p_sub_type == 'ClusterCentroids':
        if p_sub_params['estimator'] == 'KMeans':
            p_sub_params['estimator'] = KMeans(random_state=p_random_state)
        elif p_sub_params['estimator'] == 'MiniBatchKMeans':
            p_sub_params['estimator'] = MiniBatchKMeans(random_state=p_random_state)
        smo = ClusterCentroids(**p_sub_params)
    elif p_sub_type == 'RandomUnderSampler':
        smo = RandomUnderSampler(**p_sub_params)
    elif p_sub_type == 'NearMiss':
        smo = NearMiss(**p_sub_params)
    elif p_sub_type == 'InstanceHardnessThreshold':
        if p_sub_params['estimator'] == 'knn':
            p_sub_params['estimator'] = KNeighborsClassifier()
        elif p_sub_params['estimator'] == 'decision-tree':
            p_sub_params['estimator'] = DecisionTreeClassifier()
        elif p_sub_params['estimator'] == 'adaboost':
            p_sub_params['estimator'] = AdaBoostClassifier()
        elif p_sub_params['estimator'] == 'gradient-boosting':
            p_sub_params['estimator'] = GradientBoostingClassifier()
        elif p_sub_params['estimator'] == 'linear-svm':
            p_sub_params['estimator'] = CalibratedClassifierCV(LinearSVC())
        elif p_sub_params['estimator'] == 'random-forest':
            p_sub_params['estimator'] = RandomForestClassifier(n_estimators=100)
        smo = InstanceHardnessThreshold(**p_sub_params)
    elif p_sub_type == 'CondensedNearestNeighbour':
        smo = CondensedNearestNeighbour(**p_sub_params)
    elif p_sub_type == 'EditedNearestNeighbours':
        smo = EditedNearestNeighbours(**p_sub_params)
    elif p_sub_type == 'RepeatedEditedNearestNeighbours':
        smo = RepeatedEditedNearestNeighbours(**p_sub_params)
    elif p_sub_type == 'AllKNN':
        smo = AllKNN(**p_sub_params)
    elif p_sub_type == 'NeighbourhoodCleaningRule':
        smo = NeighbourhoodCleaningRule(**p_sub_params)
    elif p_sub_type == 'OneSidedSelection':
        smo = OneSidedSelection(**p_sub_params)
    # Combined over- and under-sampling
    elif p_sub_type == 'SMOTEENN':
        smo = SMOTEENN(**p_sub_params)
    elif p_sub_type == 'SMOTETomek':
        smo = SMOTETomek(**p_sub_params)

    e = ''
    try:
        for train, test in cv.split(X, y):
            if p_sub_type == 'NO':
                X_smo_train, y_smo_train = X[train], y[train]
            else:
                X_smo_train, y_smo_train = smo.fit_resample(X[train], y[train])
            y_test_pred = clf.fit(X_smo_train, y_smo_train).predict(X[test])
            gm = geometric_mean_score(y[test], y_test_pred, average='binary')
            gmean.append(gm)
        mean_g = np.mean(gmean)
    except Exception as eec:
        e = eec
        mean_g = 0
        ifError = 1
        errorcount = errorcount + 1
    gm_loss = 1 - mean_g
    abc = time.time() - starttime
    if mean_g > best:
        best = mean_g
        params_best = copy.deepcopy(parambk)
    return {
        'loss': gm_loss,
        'mean': mean_g,
        'status': STATUS_OK,
        # -- store other results like this
        'run_time': abc,
        'iter': iid,
        'current_best': best,
        'eval_time': time.time(),
        'SamplingGrp': sampler,
        'SamplingType': p_sub_type,
        'ifError': ifError,
        'Error': e,
        'params': parambk,
        'attachments': {'time_module': pickle.dumps(time.time)},
    }
# density.

# %%
from imblearn.over_sampling import BorderlineSMOTE, KMeansSMOTE, SVMSMOTE

X, y = create_dataset(n_samples=5000, weights=(0.01, 0.05, 0.94), class_sep=0.8)

fig, axs = plt.subplots(5, 2, figsize=(15, 30))

samplers = [
    SMOTE(random_state=0),
    BorderlineSMOTE(random_state=0, kind="borderline-1"),
    BorderlineSMOTE(random_state=0, kind="borderline-2"),
    KMeansSMOTE(random_state=0),
    SVMSMOTE(random_state=0),
]

for ax, sampler in zip(axs, samplers):
    model = make_pipeline(sampler, clf).fit(X, y)
    plot_decision_function(
        X, y, model, ax[0], title=f"Decision function for {sampler.__class__.__name__}"
    )
    plot_resampling(X, y, sampler, ax[1])

fig.suptitle("Decision function and resampling using SMOTE variants")
fig.tight_layout()
""" plt.figure(figsize=(20,5)) plt.title('Classes da variável Churn desbalanceadas', size=15) sns.countplot(x='Churn', data=churn2) plt.xlabel('Classes', size=15) plt.ylabel(''); """Rebalanceamento das classes com vários algoritmos de reamostragem de dados. Aqui irei aplicar model0s de *Oversampling*, de *Undersampling* e dessas duas técnicas de forma combinada.""" #Algoritmos de Oversampling X1,y1=SMOTE().fit_resample(X,y) X2,y2=ADASYN().fit_resample(X,y) X3,y3=BorderlineSMOTE().fit_resample(X,y) X4,y4=SVMSMOTE().fit_resample(X,y) X5,y5=KMeansSMOTE().fit_resample(X,y) X6,y6=SMOTEN().fit_resample(X,y) #X7,y7=SMOTENC().fit_resample(X,y) X8,y8=RandomOverSampler().fit_resample(X,y) #Algoritmos de Undersampling X9,y9=RandomUnderSampler().fit_resample(X,y) X10,y10=NearMiss().fit_resample(X,y) X11,y11=EditedNearestNeighbours().fit_resample(X,y) X12,y12=RepeatedEditedNearestNeighbours().fit_resample(X,y) X13,y13=AllKNN().fit_resample(X,y) #X14,y14=CondensedNearestNeighbour().fit_resample(X,y) X15,y15=OneSidedSelection().fit_resample(X,y) X16,y16=NeighbourhoodCleaningRule().fit_resample(X,y) X17,y17=InstanceHardnessThreshold().fit_resample(X,y)
y = data[col[-1]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)

imp = SimpleImputer(strategy='mean')   # mean univariate imputation
X_train = imp.fit_transform(X_train)   # impute the training set
X_test = imp.transform(X_test)         # impute the test set

prep = StandardScaler()
X_train = prep.fit_transform(X_train)
X_test = prep.transform(X_test)

ops_ada = ADASYN(random_state=10)
ops_bsmote = BorderlineSMOTE(random_state=10)
ops_ksmote = KMeansSMOTE(random_state=10)
ops_rs = RandomOverSampler(random_state=10)
ops_s = SMOTE(random_state=10)

X_train_ada, y_train_ada = ops_ada.fit_resample(X_train, y_train)
X_train_bsmote, y_train_bsmote = ops_bsmote.fit_resample(X_train, y_train)
X_train_ksmote, y_train_ksmote = ops_ksmote.fit_resample(X_train, y_train)
X_train_rs, y_train_rs = ops_rs.fit_resample(X_train, y_train)
X_train_s, y_train_s = ops_s.fit_resample(X_train, y_train)

dic_ = {
    'ADASYN': [X_train_ada, y_train_ada],
    'BorderlineSMOTE': [X_train_bsmote, y_train_bsmote],
    'RandomOverSampler': [X_train_rs, y_train_rs],
    'SMOTE': [X_train_s, y_train_s]
}
y_vals = dataframe.iloc[:, 18:19]
print(y_vals.value_counts())

pca = PCA(n_components=3)
# X_train = pca.fit_transform(X_train)
y_train = y_train.ravel()
seed = 6

# Separate categorical features from continuous ones, scale the continuous
# features, then apply SMOTE-NC to all of them.
smote_value = 0.55
print("smote value is " + str(smote_value))
sm = KMeansSMOTE(random_state=seed, sampling_strategy=smote_value,
                 cluster_balance_threshold=0.3)
rfe = RFE(estimator=DecisionTreeClassifier(), n_features_to_select=17)
inpt = 17


def create_model(x):
    def bm():
        clf = Sequential()
        clf.add(Dense(9, activation='relu', input_dim=x))
        clf.add(Dense(9, activation='relu'))
        clf.add(Dense(2, activation='sigmoid'))
        clf.compile(loss='categorical_crossentropy', optimizer=SGD())
        return clf
    return bm