def test_sample_borderline1():
    """Test sample function with borderline1 SMOTE."""
    # Create the object
    kind = 'borderline1'
    smote = SMOTE(random_state=RND_SEED, kind=kind)
    # Fit the data
    smote.fit(X, Y)
    X_resampled, y_resampled = smote.fit_sample(X, Y)
    X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141],
                     [1.25192108, -0.22367336], [0.53366841, -0.30312976],
                     [1.52091956, -0.49283504], [-0.28162401, -2.10400981],
                     [0.83680821, 1.72827342], [0.3084254, 0.33299982],
                     [0.70472253, -0.73309052], [0.28893132, -0.38761769],
                     [1.15514042, 0.0129463], [0.88407872, 0.35454207],
                     [1.31301027, -0.92648734], [-1.11515198, -0.93689695],
                     [-0.18410027, -0.45194484], [0.9281014, 0.53085498],
                     [-0.14374509, 0.27370049], [-0.41635887, -0.38299653],
                     [0.08711622, 0.93259929], [1.70580611, -0.11219234],
                     [0.3765279, -0.2009615], [0.55276636, -0.10550373],
                     [0.45413452, -0.08883319], [1.21118683, -0.22817957]])
    y_gt = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1,
                     0, 0, 0, 0, 0])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
def test_sample_svm():
    """Test sample function with SVM SMOTE."""
    # Create the object
    kind = 'svm'
    smote = SMOTE(random_state=RND_SEED, kind=kind)
    # Fit the data
    smote.fit(X, Y)
    X_resampled, y_resampled = smote.fit_sample(X, Y)
    X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141],
                     [1.25192108, -0.22367336], [0.53366841, -0.30312976],
                     [1.52091956, -0.49283504], [-0.28162401, -2.10400981],
                     [0.83680821, 1.72827342], [0.3084254, 0.33299982],
                     [0.70472253, -0.73309052], [0.28893132, -0.38761769],
                     [1.15514042, 0.0129463], [0.88407872, 0.35454207],
                     [1.31301027, -0.92648734], [-1.11515198, -0.93689695],
                     [-0.18410027, -0.45194484], [0.9281014, 0.53085498],
                     [-0.14374509, 0.27370049], [-0.41635887, -0.38299653],
                     [0.08711622, 0.93259929], [1.70580611, -0.11219234],
                     [0.47436888, -0.2645749], [1.07844561, -0.19435291],
                     [1.44015515, -1.30621303]])
    y_gt = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1,
                     0, 0, 0, 0])
    assert_array_almost_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_sample_regular_half():
    """Test sample function with regular SMOTE and a ratio of 0.8."""
    # Create the object
    ratio = 0.8
    kind = 'regular'
    smote = SMOTE(ratio=ratio, random_state=RND_SEED, kind=kind)
    # Fit the data
    smote.fit(X, Y)
    X_resampled, y_resampled = smote.fit_sample(X, Y)
    X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141],
                     [1.25192108, -0.22367336], [0.53366841, -0.30312976],
                     [1.52091956, -0.49283504], [-0.28162401, -2.10400981],
                     [0.83680821, 1.72827342], [0.3084254, 0.33299982],
                     [0.70472253, -0.73309052], [0.28893132, -0.38761769],
                     [1.15514042, 0.0129463], [0.88407872, 0.35454207],
                     [1.31301027, -0.92648734], [-1.11515198, -0.93689695],
                     [-0.18410027, -0.45194484], [0.9281014, 0.53085498],
                     [-0.14374509, 0.27370049], [-0.41635887, -0.38299653],
                     [0.08711622, 0.93259929], [1.70580611, -0.11219234],
                     [0.36784496, -0.1953161]])
    y_gt = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1,
                     0, 0])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
def test_sample_wrong_X():
    """Test that an error is raised when the data passed at sampling time
    differs from the data seen at fit time."""
    # Create the object
    sm = SMOTE(random_state=RND_SEED)
    sm.fit(X, Y)
    assert_raises(RuntimeError, sm.sample,
                  np.random.random((100, 40)),
                  np.array([0] * 50 + [1] * 50))
def test_smote_fit():
    """Test the fitting method."""
    # Create the object
    smote = SMOTE(random_state=RND_SEED)
    # Fit the data
    smote.fit(X, Y)
    # Check that the class statistics have been computed
    assert_equal(smote.min_c_, 0)
    assert_equal(smote.maj_c_, 1)
    assert_equal(smote.stats_c_[0], 8)
    assert_equal(smote.stats_c_[1], 12)
def age_prediction(df, classifier='lr'):
    df.dropna(subset=['age'], inplace=True)
    # Data to use
    X = df['text_p']  # age features extraction
    y = df['age']
    # Results without oversampling and only cv - F1 macro
    # NB: 0.64, LR: 0.66, RF: 0.54
    # Using the synthetic minority over-sampling technique (SMOTE)
    smote = SMOTE(ratio='minority')
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=.3, random_state=33)
    # fit() alone does not resample; fit_sample() returns the balanced data
    X_sm, y_sm = smote.fit_sample(X_train, y_train)
    if classifier == 'nb':
        nb(X_sm, X_test, y_sm, y_test)
    elif classifier == 'lr':
        lr(X_sm, X_test, y_sm, y_test)
    elif classifier == 'rf':
        rf(X_sm, X_test, y_sm, y_test)
    else:
        sgd(X_sm, X_test, y_sm, y_test)
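# SMOTE interpolates between numeric feature vectors, so the raw 'text_p'
# column above has to be vectorized before resampling. A minimal sketch of
# that step, assuming scikit-learn's TfidfVectorizer and an imblearn version
# that accepts sparse input; the vectorizer settings and the helper name are
# illustrative assumptions, not part of the original pipeline.
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split


def oversample_text(texts, labels):
    # Turn free text into a numeric matrix before applying SMOTE
    vectorizer = TfidfVectorizer(max_features=5000)
    X_vec = vectorizer.fit_transform(texts)
    X_train, X_test, y_train, y_test = train_test_split(
        X_vec, labels, test_size=.3, random_state=33)
    X_sm, y_sm = SMOTE(ratio='minority').fit_sample(X_train, y_train)
    return X_sm, X_test, y_sm, y_test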
def split_data(self, data, seed, re=False):
    X, y = data.iloc[:, 1:-1], data.iloc[:, -1]
    # Train-test split
    test_size = 0.2
    X_train_o, X_test, y_train_o, y_test = model_selection.train_test_split(
        X, y, test_size=test_size, random_state=seed)
    # Resampling
    if re:
        resam = SMOTE(random_state=seed)
        # fit_resample both fits and resamples; a separate fit call is
        # unnecessary
        X_train, y_train = resam.fit_resample(X_train_o, y_train_o)
        X_train = pd.DataFrame(X_train, columns=X_train_o.columns)
        y_train = pd.Series(y_train)
    else:
        X_train, y_train = X_train_o, y_train_o
    return X, y, X_train, y_train, X_test, y_test
def test_sample_regular():
    """Test sample function with regular SMOTE."""
    # Create the object
    kind = 'regular'
    smote = SMOTE(random_state=RND_SEED, kind=kind)
    # Fit the data
    smote.fit(X, Y)
    X_resampled, y_resampled = smote.fit_sample(X, Y)
    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'smote_reg_x.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'smote_reg_y.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_sample_svm():
    """Test sample function with SVM SMOTE."""
    # Create the object
    kind = 'svm'
    smote = SMOTE(random_state=RND_SEED, kind=kind)
    # Fit the data
    smote.fit(X, Y)
    X_resampled, y_resampled = smote.fit_sample(X, Y)
    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'smote_svm_x.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'smote_svm_y.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
class SMOTER:
    """Thin wrapper exposing SMOTE through a fit/transform interface."""

    def __init__(self, *args, **kwargs):
        self.smote = SMOTE(*args, **kwargs)
        self.params = dict(kwargs)

    def fit(self, X, y):
        self.smote.fit(X, y)
        return None

    def transform(self, X, y=None):
        return self.smote.sample(X, y)

    def get_params(self, deep=True):
        return self.params
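# A minimal usage sketch of the SMOTER wrapper above on toy imbalanced data.
# The toy data and parameters are illustrative assumptions; the wrapper
# relies on the older imblearn API (fit()/sample()) it was written against.
import numpy as np

rng = np.random.RandomState(0)
X_toy = rng.normal(size=(60, 4))
y_toy = np.array([0] * 50 + [1] * 10)  # imbalanced 5:1

sampler = SMOTER(random_state=0)
sampler.fit(X_toy, y_toy)
X_bal, y_bal = sampler.transform(X_toy, y_toy)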
def test_sample_regular_half():
    """Test sample function with regular SMOTE and a ratio of 0.5."""
    # Create the object
    ratio = 0.5
    kind = 'regular'
    smote = SMOTE(ratio=ratio, random_state=RND_SEED, kind=kind)
    # Fit the data
    smote.fit(X, Y)
    X_resampled, y_resampled = smote.fit_sample(X, Y)
    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'smote_reg_x_05.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'smote_reg_y_05.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def split_data(self, data, seed, re=False):
    # Encode the target labels as integers (fit_transform both fits and maps)
    lbl = preprocessing.LabelEncoder()
    data["class"] = lbl.fit_transform(list(data["class"].values))
    X, y = data.iloc[:, 0:-1], data.iloc[:, -1]
    X = self.OnehotEncode(X, X.select_dtypes("category").columns)
    X.columns = [col.replace("<", "_") for col in X.columns]
    # Train-test split
    test_size = 0.3
    X_train_o, X_test, y_train_o, y_test = model_selection.train_test_split(
        X, y, test_size=test_size, random_state=seed)
    # Resampling
    if re:
        resam = SMOTE(random_state=seed)
        X_train, y_train = resam.fit_resample(X_train_o, y_train_o)
        X_train = pd.DataFrame(X_train, columns=X_train_o.columns)
        y_train = pd.Series(y_train)
    else:
        X_train, y_train = X_train_o, y_train_o
    return X_train, y_train, X_test, y_test
plt.legend(loc="lower right")
plt.show()

## 2(3)
from sklearn.metrics import (confusion_matrix, precision_recall_curve, auc,
                             roc_curve, recall_score, classification_report,
                             f1_score, precision_recall_fscore_support)

sample_leaf_options = [1, 5, 10, 50, 100, 200, 500]
RF = RandomForestClassifier(min_samples_split=20, random_state=99,
                            max_depth=(len(X_train) - 1))
RF.fit(X_train, y_train)
predict_RF = RF.predict(X_test)
print('accuracy using RF:', accuracy_score(predict_RF, y_test))

sm = svm.SVC(C=5, kernel='rbf', gamma=0.02)
sm.fit(X_train, y_train)
predict_sm = sm.predict(X_test)
print('accuracy using sm:', accuracy_score(predict_sm, y_test))

### MLP
mlp_clf = MLPClassifier(solver='sgd', alpha=1e-4, hidden_layer_sizes=(10, 3),
                        learning_rate='adaptive', random_state=1,
                        activation='tanh')
mlp_clf.fit(X_train, y_train)
predict_mlp = mlp_clf.predict(X_test)
print('accuracy using NN:', accuracy_score(predict_mlp, y_test))

report = classification_report(y_test, predict_RF)
fpr, tpr, thresholds = roc_curve(y_test, predict_RF)
roc_auc = auc(fpr, tpr)

## MLP model 2
model = Sequential()
model.add(Dense(512, activation='relu', input_shape=(30,)))
model.add(Dropout(0.2))
def _sample(self, X, y):
    # Create the clusters and set the labels
    self._set_cluster()
    self._fit_cluster(X, y)
    self.labels = self._cluster_class.labels_
    X_resampled = X.copy()
    y_resampled = y.copy()
    with catch_warnings():
        filterwarnings("ignore", category=UserWarning, module="imblearn")
        for target_class in self.ratio_:
            n_to_generate = self.ratio_[target_class]
            clusters_to_use = self._filter_clusters(
                y, self._cluster_class.labels_, target_class)
            # If there is no cluster in which the target class is dominant,
            # only warn; nothing is generated for that class
            if not clusters_to_use and n_to_generate > 0:
                warn("Class does not have a cluster where it is dominant.")
            else:
                sampling_weights = self._calculate_sampling_weights(
                    X, y, clusters_to_use, self.labels, target_class)
                for cluster in sampling_weights:
                    mask = self.labels == cluster
                    X_cluster = X[mask]
                    y_cluster = y[mask]
                    n_obs = mask.sum()
                    artificial_index = -1
                    # SMOTE needs at least two distinct target values, so
                    # add an artificial observation of another class when the
                    # cluster is pure
                    if np.unique(y_cluster).size < 2:
                        art_x = np.zeros((1, X.shape[1]))
                        artificial_index = n_obs
                        artificial_y = np.unique(y)[
                            np.unique(y) != target_class][0]
                        X_cluster = np.concatenate((X_cluster, art_x), axis=0)
                        y_cluster = np.concatenate(
                            (y_cluster,
                             np.asarray(artificial_y).reshape((1, ))),
                            axis=0)
                    minority_obs = y_cluster[y_cluster == target_class]
                    n_new = n_to_generate * sampling_weights[cluster]
                    temp_dic = {
                        target_class: int(round(n_new) + minority_obs.size)
                    }
                    # k_neighbors must be smaller than the number of minority
                    # observations in the cluster
                    if self.k_neighbors > minority_obs.size - 1:
                        k_neighbors = minority_obs.size - 1
                    else:
                        k_neighbors = self.k_neighbors
                    over_sampler = SMOTE(ratio=temp_dic,
                                         k_neighbors=k_neighbors)
                    over_sampler.fit(X_cluster, y_cluster)
                    X_cluster_resampled, y_cluster_resampled = \
                        over_sampler.sample(X_cluster, y_cluster)
                    # If an artificial observation was added, remove it now
                    if artificial_index > 0:
                        X_cluster_resampled = np.delete(
                            X_cluster_resampled, artificial_index, axis=0)
                        y_cluster_resampled = np.delete(
                            y_cluster_resampled, artificial_index)
                    # Keep only the newly generated samples
                    X_cluster_resampled = X_cluster_resampled[n_obs:, :]
                    y_cluster_resampled = y_cluster_resampled[n_obs:, ]
                    # Append the newly generated samples to the output
                    X_resampled = np.concatenate(
                        (X_resampled, X_cluster_resampled))
                    y_resampled = np.concatenate(
                        (y_resampled, y_cluster_resampled))
    return X_resampled, y_resampled
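# The cluster-then-SMOTE scheme implemented above is also available in
# imblearn (>= 0.5) as KMeansSMOTE; a minimal sketch on toy data. The
# dataset and parameter values are illustrative assumptions, and KMeansSMOTE
# may raise an error when no cluster is sufficiently dominated by the
# minority class.
from collections import Counter
from imblearn.over_sampling import KMeansSMOTE
from sklearn.datasets import make_classification

X_km, y_km = make_classification(n_samples=1000, weights=[0.9, 0.1],
                                 random_state=0)
km_sampler = KMeansSMOTE(k_neighbors=3, cluster_balance_threshold=0.05,
                         random_state=0)
X_kbal, y_kbal = km_sampler.fit_resample(X_km, y_km)
print(Counter(y_km), '->', Counter(y_kbal))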
class SMOTEBoost(AdaBoostClassifier):
    """Implementation of SMOTEBoost.

    SMOTEBoost introduces data sampling into the AdaBoost algorithm by
    oversampling the minority class using SMOTE on each boosting iteration
    [1]. This implementation inherits methods from the scikit-learn
    AdaBoostClassifier class, only modifying the `fit` method.

    Parameters
    ----------
    n_samples : int, optional (default=100)
        Number of new synthetic samples per boosting step.

    k_neighbors : int, optional (default=5)
        Number of nearest neighbors.

    base_estimator : object, optional (default=DecisionTreeClassifier)
        The base estimator from which the boosted ensemble is built.
        Support for sample weighting is required, as well as proper
        `classes_` and `n_classes_` attributes.

    n_estimators : int, optional (default=50)
        The maximum number of estimators at which boosting is terminated.
        In case of perfect fit, the learning procedure is stopped early.

    learning_rate : float, optional (default=1.)
        Learning rate shrinks the contribution of each classifier by
        ``learning_rate``. There is a trade-off between ``learning_rate``
        and ``n_estimators``.

    algorithm : {'SAMME', 'SAMME.R'}, optional (default='SAMME.R')
        If 'SAMME.R' then use the SAMME.R real boosting algorithm.
        ``base_estimator`` must support calculation of class probabilities.
        If 'SAMME' then use the SAMME discrete boosting algorithm.
        The SAMME.R algorithm typically converges faster than SAMME,
        achieving a lower test error with fewer boosting iterations.

    random_state : int or None, optional (default=None)
        If int, random_state is the seed used by the random number
        generator. If None, the random number generator is the RandomState
        instance used by np.random.

    References
    ----------
    .. [1] N. V. Chawla, A. Lazarevic, L. O. Hall, and K. W. Bowyer.
           "SMOTEBoost: Improving Prediction of the Minority Class in
           Boosting." European Conference on Principles of Data Mining and
           Knowledge Discovery (PKDD), 2003.
    """

    def __init__(self,
                 n_samples=100,
                 k_neighbors=5,
                 # base_estimator=None,
                 base_estimator=SVC(probability=True, kernel='linear'),
                 n_estimators=50,
                 learning_rate=1.,
                 # algorithm='SAMME.R',
                 algorithm='SAMME',
                 random_state=None):
        self.n_samples = n_samples
        self.algorithm = algorithm
        self.smote = SMOTE(k_neighbors=k_neighbors,
                           random_state=random_state)

        super(SMOTEBoost, self).__init__(
            base_estimator=base_estimator,
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            random_state=random_state)

    def fit(self, X, y, sample_weight=None, minority_target=None):
        """Build a boosted classifier/regressor from the training set (X, y),
        performing SMOTE during each boosting step.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape = [n_samples, n_features]
            The training input samples. Sparse matrix can be CSC, CSR, COO,
            DOK, or LIL. COO, DOK, and LIL are converted to CSR. The dtype is
            forced to DTYPE from tree._tree if the base classifier of this
            ensemble weighted boosting classifier is a tree or forest.

        y : array-like of shape = [n_samples]
            The target values (class labels in classification, real numbers
            in regression).

        sample_weight : array-like of shape = [n_samples], optional
            Sample weights. If None, the sample weights are initialized to
            1 / n_samples.

        minority_target : int
            Minority class label.

        Returns
        -------
        self : object
            Returns self.

        Notes
        -----
        Based on the scikit-learn v0.18 AdaBoostClassifier and
        BaseWeightBoosting `fit` methods.
        """
        # Check that the algorithm is supported.
        if self.algorithm not in ('SAMME', 'SAMME.R'):
            raise ValueError("algorithm %s is not supported" % self.algorithm)

        # Check parameters.
        if self.learning_rate <= 0:
            raise ValueError("learning_rate must be greater than zero")

        if (self.base_estimator is None or
                isinstance(self.base_estimator,
                           (BaseDecisionTree, BaseForest))):
            DTYPE = np.float64  # from fast_dict.pxd
            dtype = DTYPE
            accept_sparse = 'csc'
        else:
            dtype = None
            accept_sparse = ['csr', 'csc']

        X, y = check_X_y(X, y, accept_sparse=accept_sparse, dtype=dtype,
                         y_numeric=is_regressor(self))

        if sample_weight is None:
            # Initialize weights to 1 / n_samples.
            sample_weight = np.empty(X.shape[0], dtype=np.float64)
            sample_weight[:] = 1. / X.shape[0]
        else:
            sample_weight = check_array(sample_weight, ensure_2d=False)
            # Normalize existing weights.
            sample_weight = sample_weight / sample_weight.sum(dtype=np.float64)

            # Check that the sample weights sum is positive.
            if sample_weight.sum() <= 0:
                raise ValueError("Attempting to fit with a non-positive "
                                 "weighted number of samples.")

        if minority_target is None:
            # Determine the minority class label.
            stats_c_ = Counter(y)
            maj_c_ = max(stats_c_, key=stats_c_.get)
            min_c_ = min(stats_c_, key=stats_c_.get)
            self.minority_target = min_c_
        else:
            self.minority_target = minority_target

        # Check parameters.
        self._validate_estimator()

        # Clear any previous fit results.
        self.estimators_ = []
        self.estimator_weights_ = np.zeros(self.n_estimators,
                                           dtype=np.float64)
        self.estimator_errors_ = np.ones(self.n_estimators, dtype=np.float64)

        random_state = check_random_state(self.random_state)

        for iboost in range(self.n_estimators):
            # SMOTE step.
            # X_min = X[np.where(y == self.minority_target)]
            # self.smote.fit(X_min)
            # X_syn = self.smote.sample(self.n_samples)
            # y_syn = np.full(X_syn.shape[0],
            #                 fill_value=self.minority_target, dtype=np.int64)
            # fit() alone does not resample; fit_sample() returns the
            # original samples followed by the synthetic ones.
            X_temp, y_temp = self.smote.fit_sample(X, y)
            X_syn = X_temp[len(X):]
            y_syn = y_temp[len(y):]

            # Normalize synthetic sample weights based on current training
            # set.
            sample_weight_syn = np.empty(X_syn.shape[0], dtype=np.float64)
            sample_weight_syn[:] = 1. / X.shape[0]

            # Combine the original and synthetic samples.
            X = np.vstack((X, X_syn))
            y = np.append(y, y_syn)

            # Combine the weights.
            sample_weight = \
                np.append(sample_weight, sample_weight_syn).reshape(-1, 1)
            sample_weight = \
                np.squeeze(normalize(sample_weight, axis=0, norm='l1'))

            # X, y, sample_weight = shuffle(X, y, sample_weight,
            #                               random_state=random_state)

            # Boosting step.
            sample_weight, estimator_weight, estimator_error = self._boost(
                iboost, X, y, sample_weight, random_state)

            # Early termination.
            if sample_weight is None:
                break

            self.estimator_weights_[iboost] = estimator_weight
            self.estimator_errors_[iboost] = estimator_error

            # Stop if error is zero.
            if estimator_error == 0:
                break

            sample_weight_sum = np.sum(sample_weight)

            # Stop if the sum of sample weights has become non-positive.
            if sample_weight_sum <= 0:
                break

            if iboost < self.n_estimators - 1:
                # Normalize.
                sample_weight /= sample_weight_sum

        return self
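# A minimal usage sketch of the SMOTEBoost class above on toy imbalanced
# data; the dataset and parameter values are illustrative assumptions, not
# recommendations.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X_demo, y_demo = make_classification(n_samples=500, weights=[0.9, 0.1],
                                     random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, random_state=0)

booster = SMOTEBoost(n_samples=50, n_estimators=20, random_state=0)
booster.fit(X_tr, y_tr)
print('test accuracy:', booster.score(X_te, y_te))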
metrics=["accuracy"]) # if __name__ == '__main__': # img_path = '/data/edong/PycharmProjects/projpy/CancerOriginal/00-3734A_Thionin_Cancer_FEU_00000_1_40x.tif' # img = image.load_img(img_path, target_size=(96, 96)) # img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) # x = image.img_to_array(img) # x = np.expand_dims(x, axis=0) # x = preprocess_input(x) # print('Input image shape:', x.shape) # # Test a image: # preds = model.predict(x) # print('Predicted:', decode_predictions(preds)) # Fit the model with data from imblearn.over_sampling import SMOTE sm = SMOTE() histsmote = sm.fit(X_train, y_train, batch_size=64, epochs=80, verbose=1, validation_data=(X_val, y_val)) hist = model.fit(X_train, y_train, batch_size=64, epochs=80, verbose=1, validation_data=(X_val, y_val))
y_rfm = df_modeling_rfm['response']
# CLV
X_clv = df_modeling_clv.drop(columns=['response', 'customer_id'])
y_clv = df_modeling_clv['response']

## Creating train and test datasets
# RFM
X_train_rfm, X_test_rfm, y_train_rfm, y_test_rfm = train_test_split(
    X_rfm, y_rfm, test_size=0.3, random_state=0)
# CLV
X_train_clv, X_test_clv, y_train_clv, y_test_clv = train_test_split(
    X_clv, y_clv, test_size=0.3, random_state=0)

# Oversample with SMOTE because the data is imbalanced
sm = SMOTE(random_state=0)
# RFM (fit_sample both fits and resamples; no separate fit call is needed)
X_SMOTE_rfm, y_SMOTE_rfm = sm.fit_sample(X_train_rfm, y_train_rfm)
# CLV
X_SMOTE_clv, y_SMOTE_clv = sm.fit_sample(X_train_clv, y_train_clv)

print('logistic regression model - SMOTE RFM')
logreg = LogisticRegression(solver='liblinear', class_weight='balanced')
predicted_y = []
expected_y = []
logreg_model_SMOTE_rfm = logreg.fit(X_SMOTE_rfm, y_SMOTE_rfm)
predictions = logreg_model_SMOTE_rfm.predict(X_SMOTE_rfm)
predicted_y.extend(predictions)
expected_y.extend(y_SMOTE_rfm)
def multiclass_classification(X, Y, sub_to_main_type, feature_names,
                              isSubType, samplingMethod):
    """Multi-class classification with optional sampling methods.

    :param X: numpy array of features.
    :param Y: numpy array of class labels.
    :param sub_to_main_type: dict mapping network sub-type to network type.
    :param feature_names: a list of feature names.
    :param isSubType: flag indicating whether the labels in Y are network
        subtypes.
    :param samplingMethod: name of the sampling method. Valid names are
        RandomOver, RandomUnder, SMOTE, and None.
    :return:
        cm: confusion matrix.
        NetworkTypeLabels: a list of strings, either network types or
            network subtypes.
        accuracy: accuracy in the range [0, 1].
        feature_importances: a list of tuples of a feature's name and its
            importance in the classification.
    """
    if isSubType:
        NetworkTypeLabels = sorted(
            list(set(Y)), key=lambda sub_type: sub_to_main_type[sub_type])
    else:
        NetworkTypeLabels = sorted(list(set(Y)))

    sss = StratifiedShuffleSplit(n_splits=3, test_size=0.4, random_state=0)
    for train_index, test_index in sss.split(X, Y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]

    if samplingMethod == "RandomOver":
        random_over = RandomOverSampler()
        sampled_x, sampled_y = random_over.fit_sample(X_train, y_train)
    elif samplingMethod == "RandomUnder":
        random_under = RandomUnderSampler()
        sampled_x, sampled_y = random_under.fit_sample(X_train, y_train)
    # This version of imblearn's SMOTE does not support multi-class data,
    # so each minority class is inflated in a binary setting against the
    # majority class. The resulting set should give every class the same
    # number of instances as the largest class.
    elif samplingMethod == "SMOTE":
        sm = SMOTE(kind='regular', k=3)
        sm.fit(X_train, y_train)
        # Get the label of the largest class in terms of the number of
        # instances.
        majority = sm.maj_c_
        all_X = []
        all_Y = []
        for network_type in NetworkTypeLabels:
            if network_type != majority:
                # Extract the elements of a pair of network types, i.e. the
                # majority and the one to be inflated.
                X_extracted = np.concatenate(
                    (X_train[y_train == majority],
                     X_train[y_train == network_type]), axis=0)
                Y_extracted = np.concatenate(
                    (y_train[y_train == majority],
                     y_train[y_train == network_type]), axis=0)
                x_tmp, y_tmp = sm.fit_sample(X_extracted, Y_extracted)
                x = x_tmp[y_tmp == network_type]
                y = y_tmp[y_tmp == network_type]
                all_X.append(x)
                all_Y.append(y)
        all_X.append(X_train[y_train == majority])
        all_Y.append(y_train[y_train == majority])
        Xs = np.concatenate(tuple(all_X))
        Ys = np.concatenate(tuple(all_Y))
        sampled_x, sampled_y = sm.fit_sample(Xs, Ys)
    elif samplingMethod == "None":
        sampled_x, sampled_y = X_train, y_train

    random_forest = RandomForestClassifier()
    random_forest.fit(sampled_x, sampled_y)
    accuracy = random_forest.score(X_test, y_test)
    feature_importances = sorted(
        zip(map(lambda x: round(x, 4), random_forest.feature_importances_),
            feature_names),
        reverse=True)
    y_pred = random_forest.predict(X_test)
    cm = confusion_matrix(y_test, y_pred, labels=NetworkTypeLabels)
    return cm, NetworkTypeLabels, accuracy, feature_importances
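# Newer imblearn releases (>= 0.4) resample multi-class data directly with
# fit_resample, which would replace the pairwise loop above; a minimal
# sketch on assumed toy data.
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification

X_mc, y_mc = make_classification(n_classes=3, n_informative=4,
                                 weights=[0.7, 0.2, 0.1], random_state=0)
X_mc_bal, y_mc_bal = SMOTE(k_neighbors=3,
                           random_state=0).fit_resample(X_mc, y_mc)
print(Counter(y_mc), '->', Counter(y_mc_bal))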
acc = do_cross_val_LR(np.array(X_resampled), y_resampled, 10)
print("Accuracy", acc)

## Random Forest
acc = do_cross_val_RForest(np.array(X_resampled), y_resampled, 10)
print("Accuracy", acc)

## Balancing by SMOTE
from imblearn.over_sampling import SMOTE

print('Original dataset shape {}'.format(Counter(y)))
sm = SMOTE(random_state=0)
X_res, y_res = sm.fit_sample(X, y)
print('Resampled dataset shape {}'.format(Counter(y_res)))

print('Original dataset shape {}'.format(Counter(y)))
sm = SMOTE(random_state=42)
sm.fit(X, y)
X_res, y_res = sm.sample(X, y)
print('Resampled dataset shape {}'.format(Counter(y_res)))

X_res1, y_res1 = sm.fit_sample(X_res, y_res)
print('Resampled dataset shape {}'.format(Counter(y_res1)))
X_res2, y_res2 = sm.fit_sample(X_res1, y_res1)
print('Resampled dataset shape {}'.format(Counter(y_res2)))

## Decision Tree
acc = do_cross_val_Decision(X_res2, y_res2, 10)
print("Accuracy", acc)

## Logistic Regression
acc = do_cross_val_LR(X_res2, y_res2, 10)
print("Accuracy", acc)

## Random Forest
acc = do_cross_val_RForest(X_res2, y_res2, 10)
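# The fit()/sample() pair and fit_sample() used above belong to the older
# imblearn API; in recent releases (>= 0.4) fit_resample is the one-step
# equivalent. A minimal sketch, assuming the same X and y as above:
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X, y)
print('Resampled dataset shape {}'.format(Counter(y_res)))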
lcol_num = [x for x in dtype.index.values if not (x in list_col_cat)]
for i in lcol_num:
    colname.append("{}".format(i))

X_train = pipeline_preprocess.transform(X_train)
X_test = pipeline_preprocess.transform(X_test)

joblib.dump([dtype, categorical_feat_classes, list_col_cat, list_idx_cat,
             categorical_onehot_idx, categorical_onehot_nval, colname],
            './model/las_kupedes_ultramikro_v3_var.sav')
# joblib.dump([le, pipeline_preprocess], './model/las_kupedes_ultramikro_v3_preprocess.sav')
joblib.dump([le, pipeline_preprocess],
            './model/las_kupedes_ultramikro_v3_preprocess_wo_scaler.sav')

### Resampling the unbalanced dataset
# (1) Over-sampling with SMOTE: keep all majority (class 0) samples and
# oversample class 1 so that it makes up def_ratio of the resampled set
def_ratio = 0.15
sm = SMOTE(random_state=42,
           ratio={0: Y_train.value_counts()[0],
                  1: int(Y_train.value_counts()[0] *
                         (def_ratio / (1 - def_ratio)))})
sm.fit(X_train, Y_train)
X_train_upsampled, Y_train_upsampled = sm.sample(X_train, Y_train)

# (2) Class weight
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight

sample_weight = compute_sample_weight(class_weight={0: 1, 1: 10},
                                      y=Y_train_upsampled)

### CV - XGBoost
from sklearn.model_selection import KFold

K = 5
kf = KFold(n_splits=K, random_state=3228, shuffle=True)
xgb_preds = []
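# Worked check of the ratio arithmetic above. With def_ratio = 0.15 and an
# assumed (illustrative) majority count of 8500, the minority target is
# 8500 * 0.15 / 0.85 = 1500, so after resampling the minority class makes up
# 1500 / (8500 + 1500) = 15% of the training set. round() guards against
# floating-point truncation that a bare int() can introduce.
def_ratio = 0.15
n_majority = 8500  # assumption for illustration only
n_minority_target = int(round(n_majority * (def_ratio / (1 - def_ratio))))
print(n_minority_target)  # 1500, i.e. 15% of the 10000-sample result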
from imblearn.over_sampling import SMOTE

# Generate a global dataset to use
RND_SEED = 0
book = xlrd.open_workbook("F:/Dot/Downloads/truncated.xls")
sheet = book.sheet_by_index(0)
X = []
for row in range(sheet.nrows):
    _row = []
    for col in range(sheet.ncols):
        _row.append(sheet.cell_value(row, col))
    X.append(_row)
X = np.asmatrix(X)
X = X.transpose()
Y = []
for i in range(96):
    if i < 87:
        Y.append(2)
    else:
        Y.append(1)
R_TOL = 1e-4

kind = 'regular'
smote = SMOTE(random_state=RND_SEED, kind=kind)
# Fit the data
smote.fit(X, Y)
X_resampled, y_resampled = smote.fit_sample(X, Y)
print(X_resampled)
model = LogisticRegression(random_state=6)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
score = accuracy_score(y_test, y_pred)
print("Accuracy score:", score)
# Code ends here

# --------------
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# Code starts here
smote = SMOTE(random_state=9)
# fit_sample both fits and resamples; a separate fit call is unnecessary
X_train, y_train = smote.fit_sample(X_train, y_train)

scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# Code ends here

# --------------
# Code starts here
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
score = accuracy_score(y_test, y_pred)
# Find intersecting features
avail_columns = compound_x.columns.intersection(full_columns)

# Select features on subset
x_data = compound_x.loc[:, avail_columns]
y_data = compound_y.copy()

# Create binary variable
y_class = np.squeeze([int(y_val <= 10) for y_val in y_data])

# SMOTE
from custom_pipe_helper import SMOTER
import auto

smote = SMOTE()
check = smote.fit(x_data, y_class)
# fit_sample requires the data as arguments; the bare fit_sample() call was
# a bug
check = smote.fit_sample(x_data, y_class)
check[0].shape
check[1]

# Create folds
# For each fold:
#   SMOTE the train data
#   Train model
#   Evaluate model
from sklearn.ensemble import AdaBoostClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold