def oversample(self):
    """Swap the stored data for a randomly over-sampled copy.

    The current ``_X`` / ``_y`` are first remembered as ``_X_original`` /
    ``_y_original``; the sampler is seeded with ``random_state=0``.
    """
    self._X_original, self._y_original = self._X, self._y
    sampler = RandomOverSampler(random_state=0)
    balanced_X, balanced_y = sampler.fit_sample(self._X, self._y)
    self._X, self._y = balanced_X, balanced_y
def _fit_resample(self, X, y):
    """Over-sample ``X`` / ``y`` after binning the target by z-score.

    ``y`` is standardised, then every sample gets a surrogate label:
    ``0`` where the z-score is above ``self.negative_thres``, ``1`` where
    it is at or below ``self.positive_thres`` (applied second, so it wins
    if both conditions hold) and ``-1`` otherwise. A ``RandomOverSampler``
    is run on those surrogate labels only to obtain ``sample_indices_``;
    the returned data are the *original* ``X`` and ``y`` indexed by them.

    Returns ``(X_resampled, y_resampled)`` and, when
    ``self.return_indices`` is truthy, the sample indices as a third item.
    The indices are also stored on ``self.sample_indices_``.
    """
    n_samples = X.shape[0]

    # convert y to z_score
    y_z = (y - y.mean()) / y.std()

    # Positions must be computed before y_z is overwritten with labels.
    index0 = np.arange(n_samples)
    index_negative = index0[y_z > self.negative_thres]
    index_positive = index0[y_z <= self.positive_thres]
    # Positions in neither bin. The set operations replace a per-element
    # `x not in ndarray` scan, which was quadratic in the sample count.
    index_unclassified = np.setdiff1d(
        index0, np.union1d(index_negative, index_positive))

    y_z[index_negative] = 0
    y_z[index_positive] = 1
    y_z[index_unclassified] = -1

    ros = RandomOverSampler(
        sampling_strategy=self.sampling_strategy,
        random_state=self.random_state,
        ratio=self.ratio)
    # Only the chosen indices matter; the resampled surrogate data is unused.
    _, _ = ros.fit_resample(X, y_z)
    sample_indices = ros.sample_indices_

    print("Before sampler: %s. Total after: %s" %
          (Counter(y_z), sample_indices.shape))

    self.sample_indices_ = np.array(sample_indices)

    if self.return_indices:
        return (safe_indexing(X, sample_indices),
                safe_indexing(y, sample_indices),
                sample_indices)
    return (safe_indexing(X, sample_indices),
            safe_indexing(y, sample_indices))
def transform(self, X, y=None):
    """Return ``X`` randomly over-sampled on ``self.predicted_column``.

    The ``y`` argument is ignored: the target is read from the
    ``self.predicted_column`` column of ``X``. The result is a new
    DataFrame with the feature columns followed by the target column.
    """
    # TODO how do we validate this happens before train/test split? Or do we need to? Can we implement it in the
    # TODO simple trainer in the correct order and leave this to advanced users?

    target_name = self.predicted_column

    # Separate the target from the features
    y = np.squeeze(X[[target_name]])
    features = X.drop([target_name], axis=1)

    # Fit the over-sampler and draw the balanced sample
    sampler = RandomOverSampler(random_state=self.random_seed)
    x_over_sampled, y_over_sampled = sampler.fit_sample(features, y)

    # Rebuild a DataFrame carrying the original feature names
    result = pd.DataFrame(x_over_sampled)
    result.columns = features.columns

    # Re-attach the over-sampled target
    result[target_name] = pd.Series(y_over_sampled)
    return result
def test_sample_wrong_X():
    """Check that sampling raises when X differs from the X used at fit."""
    ros = RandomOverSampler(random_state=RND_SEED)
    ros.fit(X, Y)

    # Data unrelated to what the sampler was fitted on.
    other_X = np.random.random((100, 40))
    other_y = np.array([0] * 50 + [1] * 50)
    assert_raises(RuntimeError, ros.sample, other_X, other_y)
def test_multiclass_fit_resample():
    """After resampling a three-class target, each class has 5 samples."""
    y = Y.copy()
    # Relabel two samples to create a third class.
    for position in (5, 6):
        y[position] = 2

    sampler = RandomOverSampler(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_resample(X, y)

    per_class = Counter(y_resampled)
    for label in (0, 1, 2):
        assert per_class[label] == 5
def test_random_over_sampling_heterogeneous_data():
    """Over-sampling a mixed-type object array keeps its dtype and rows.

    With labels ``[0, 0, 1]`` one extra sample is expected, and the added
    row must be one of the original rows.
    """
    # Builtin `object` rather than the `np.object` alias, which NumPy
    # deprecated in 1.20 and removed in 1.24.
    X_hetero = np.array([['xxx', 1, 1.0], ['yyy', 2, 2.0], ['zzz', 3, 3.0]],
                        dtype=object)
    y = np.array([0, 0, 1])
    ros = RandomOverSampler(random_state=RND_SEED)
    X_res, y_res = ros.fit_resample(X_hetero, y)

    assert X_res.shape[0] == 4
    assert y_res.shape[0] == 4
    assert X_res.dtype == object
    assert X_res[-1, 0] in X_hetero[:, 0]
def test_ros_fit_sample():
    """Compare ``fit_sample`` output with the stored ground-truth arrays."""
    sampler = RandomOverSampler(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Ground truth lives in the ``data`` directory next to this file.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'ros_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'ros_y.npy'))

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def oversample(self):
    """Randomly over-sample ``self.X`` / ``self.y`` and refresh ``self.Xview``.

    The class counts of ``self.y`` are printed before and after.
    """
    print('Current outcome sampling {}'.format(Counter(self.y)))

    # No random_state is given, so the sampler is not seeded.
    # Alternatives left disabled by the author: SMOTE(), ADASYN().
    sampler = RandomOverSampler()
    balanced_X, balanced_y = sampler.fit_sample(self.X, self.y)
    self.X = balanced_X
    self.y = balanced_y

    # View limited to the first ``n_features`` columns of the new X.
    self.Xview = self.X.view()[:, :self.n_features]
    print('Resampled dataset shape {}'.format(Counter(self.y)))
def test_ros_fit():
    """Check the class statistics recorded by ``fit``."""
    sampler = RandomOverSampler(random_state=RND_SEED)
    sampler.fit(X, Y)

    # Minority / majority labels, then the per-class sample counts.
    assert_equal(sampler.min_c_, 0)
    assert_equal(sampler.maj_c_, 1)
    for label, expected_count in ((0, 3), (1, 7)):
        assert_equal(sampler.stats_c_[label], expected_count)
def resample(X, y, sample_fraction=0.1, test_size=0.3):
    """Split ``X`` / ``y``, over-sample both splits, optionally subsample.

    Steps, each followed by diagnostic prints via ``show_balance``:

    1. ``train_test_split`` with ``test_size`` and a fixed seed of 42.
    2. ``RandomOverSampler(random_state=42)`` applied to the training
       split and, separately, to the test split.
    3. If ``sample_fraction < 1.0``, a second ``train_test_split`` per
       split keeps only the ``test_size=sample_fraction`` part.
    4. Both splits are wrapped back into DataFrames using the original
       column names of ``X`` and ``y``.

    ``y`` must expose ``.columns`` and a ``converted`` attribute (both
    are read here), i.e. presumably a DataFrame with a ``converted``
    column -- TODO confirm against callers.

    Returns ``((X_train, y_train), (X_test, y_test))``.
    """
    X_columns = X.columns
    y_columns = y.columns
    n = len(X_columns)  # expected feature count, re-checked after each step

    print('~' * 80)
    print('@@-\n', y.converted.value_counts())
    print('@@0 - Original')
    show_balance(y.values)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
    print('@@2 - y_train')
    show_balance(y_train)
    print('@@2 - y_test')
    show_balance(y_test)
    assert X_train.shape[1] == n and X_test.shape[1] == n

    # NOTE(review): the test split is over-sampled as well as the training
    # split, so downstream metrics are computed on a balanced test set.
    ros = RandomOverSampler(random_state=42)
    X_train, y_train = ros.fit_sample(X_train, y_train)
    X_test, y_test = ros.fit_sample(X_test, y_test)
    print('@@3 - Oversampled y_train')
    show_balance(y_train)
    print('@@3 - Oversampled y_test')
    show_balance(y_test)
    assert X_train.shape[1] == n and X_test.shape[1] == n

    if sample_fraction < 1.0:
        # Keep only the "test" part of a fresh split, i.e. a random
        # ``sample_fraction`` of each over-sampled set.
        _, X_train, _, y_train = train_test_split(X_train, y_train, test_size=sample_fraction, random_state=43)
        _, X_test, _, y_test = train_test_split(X_test, y_test, test_size=sample_fraction, random_state=44)
        print('@@2 - Downsampled y_train')
        show_balance(y_train)
        print('@@2 - Downsampled y_test')
        show_balance(y_test)

    assert len(X_train.shape) == 2 and len(X_test.shape) == 2, (X_train.shape, X_test.shape)
    assert X_train.shape[1] == n and X_test.shape[1] == n, (X_train.shape, X_test.shape)

    print('X_columns=%d %s' % (len(X_columns), X_columns))
    print('y_columns=%d %s' % (len(y_columns), y_columns))
    print('X_train=%-10s y_train=%s' % (list(X_train.shape), list(y_train.shape)))
    print('X_test =%-10s y_test =%s' % (list(X_test.shape), list(y_test.shape)))
    assert X_train.shape[1] == n and X_test.shape[1] == n

    # Restore DataFrames with the original column names; each y frame
    # takes the index of its X frame.
    X_train = pd.DataFrame(X_train, columns=X_columns)
    y_train = pd.DataFrame(y_train, columns=y_columns, index=X_train.index)
    X_test = pd.DataFrame(X_test, columns=X_columns)
    y_test = pd.DataFrame(y_test, columns=y_columns, index=X_test.index)

    print('@@+ y_train\n', y_train.converted.value_counts(), flush=True)
    print('@@+ y_test\n', y_test.converted.value_counts(), flush=True)

    return (X_train, y_train), (X_test, y_test)
def test_ros_fit_resample_half():
    """Resample with an explicit per-class target of ``{0: 3, 1: 7}``."""
    target_counts = {0: 3, 1: 7}
    sampler = RandomOverSampler(
        sampling_strategy=target_counts, random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_resample(X, Y)

    expected_X = np.array([
        [0.04352327, -0.20515826],
        [0.92923648, 0.76103773],
        [0.20792588, 1.49407907],
        [0.47104475, 0.44386323],
        [0.22950086, 0.33367433],
        [0.15490546, 0.3130677],
        [0.09125309, -0.85409574],
        [0.12372842, 0.6536186],
        [0.13347175, 0.12167502],
        [0.094035, -2.55298982],
    ])
    expected_y = np.array([1, 0, 1, 0, 1, 1, 1, 1, 0, 1])

    assert_allclose(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def oversample(self):
    """Randomly over-sample ``self.X`` / ``self.y`` and refresh ``self.Xview``.

    The class counts of ``self.y`` are printed before and after.
    """
    print('Current outcome sampling {}'.format(Counter(self.y)))

    # No random_state is given, so the sampler is not seeded. To pin the
    # seed instead, build it as RandomOverSampler(random_state=2).
    sampler = RandomOverSampler()
    balanced_X, balanced_y = sampler.fit_sample(self.X, self.y)
    self.X = balanced_X
    self.y = balanced_y

    # View limited to the first ``n_features`` columns of the new X.
    self.Xview = self.X.view()[:, :self.n_features]
    print('Resampled dataset shape {}'.format(Counter(self.y)))
def test_random_over_sampling_return_indices():
    """With ``return_indices=True`` the sampler also returns the indices.

    Besides matching the ground truth, the distinct indices must cover
    every row of ``X``.
    """
    sampler = RandomOverSampler(return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, sample_indices = sampler.fit_resample(X, Y)

    expected_X = np.array([
        [0.04352327, -0.20515826],
        [0.92923648, 0.76103773],
        [0.20792588, 1.49407907],
        [0.47104475, 0.44386323],
        [0.22950086, 0.33367433],
        [0.15490546, 0.3130677],
        [0.09125309, -0.85409574],
        [0.12372842, 0.6536186],
        [0.13347175, 0.12167502],
        [0.094035, -2.55298982],
        [0.92923648, 0.76103773],
        [0.47104475, 0.44386323],
        [0.92923648, 0.76103773],
        [0.47104475, 0.44386323],
    ])
    expected_y = np.array([1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0])

    assert_allclose(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)

    distinct_indices = np.sort(np.unique(sample_indices))
    assert_array_equal(distinct_indices, np.arange(len(X)))
def test_multiclass_fit_sample():
    """After ``fit_sample`` on a three-class target, each class has 3600."""
    # Turn the first 1000 labels into a third class.
    y = Y.copy()
    y[0:1000] = 2

    sampler = RandomOverSampler(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, y)

    per_class = Counter(y_resampled)
    for label in (0, 1, 2):
        assert_equal(per_class[label], 3600)
def test_ros_fit_sample_half():
    """Compare ``fit_sample`` at ``ratio=0.5`` with the ground truth."""
    sampler = RandomOverSampler(ratio=0.5, random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [0.04352327, -0.20515826],
        [0.20792588, 1.49407907],
        [0.22950086, 0.33367433],
        [0.15490546, 0.3130677],
        [0.09125309, -0.85409574],
        [0.12372842, 0.6536186],
        [0.094035, -2.55298982],
        [0.92923648, 0.76103773],
        [0.47104475, 0.44386323],
        [0.13347175, 0.12167502],
    ])
    expected_y = np.array([1, 1, 1, 1, 1, 1, 1, 0, 0, 0])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
"voicemailplan"]] = X[["internationalplan", "voicemailplan"]].replace({ "yes": 1, "no": 0 }) X.head() sns.set() plt.figure(figsize=(6, 6)) sns.countplot(y, palette="plasma") plt.show() from imblearn.over_sampling import RandomOverSampler ros = RandomOverSampler(random_state=42) X_ros, y_ros = ros.fit_sample(X, y) temp = pd.Series(y_ros) from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(X_ros, temp, random_state=42, test_size=42, stratify=temp) # Fitting the Logistic Model y_train.unique() from sklearn.linear_model import LogisticRegression
test_size=0.2, random_state=43) # TF-IDF Vectors as features # word level tf-idf tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000) tfidf_vect.fit(x_train) x_train_tfidf = tfidf_vect.transform(x_train) x_test_tfidf = tfidf_vect.transform(x_test) x_train_tfidf_os_all = [] # os = oversample y_train_tfidf_os_all = [] for i in range(6): sm_tfidf = RandomOverSampler(random_state=40) x_train_tfidf_os, y_train_tfidf_os = sm_tfidf.fit_resample(x_train_tfidf, y_train.iloc[:, i]) x_train_tfidf_os_all.append(x_train_tfidf_os) y_train_tfidf_os_all.append(y_train_tfidf_os) # svm probs rf_predict_proba_train = [] rf_predict_proba_test = [] for i in range(6): # Linear SVM with grid search param_grid = {'n_estimators': [500, 750, 1000], 'max_features': [2, 4, 6, 8],
from imblearn.over_sampling import SMOTE

SEED=0xDEADBEEF
y_col = 'add'
X_cols = ['pct_contrib','turnover','VWAP','vol','VWMC','SPTSXComp']
all_cols = X_cols + [y_col]

X = filtered[X_cols]
y = filtered[y_col]

# train_test_split returns (X_train, X_test, y_train, y_test). The names
# were previously unpacked in swapped order, so the model was fitted on
# the 20% split and predicted on the 80% split.
X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(X.values, y.values, test_size=0.2, random_state=SEED)

# Persist the exact feature/target columns used for modelling.
filtered[all_cols].to_sql('model_inputs', conn, if_exists='replace', index=False)

# Balance the training split only; the test split keeps its distribution.
#oversampler = SMOTE(random_state=SEED)
oversampler = RandomOverSampler(random_state=SEED)
# NOTE: `y_train_resamle` is a misspelling, kept because code outside this
# section may refer to it.
X_train_resample, y_train_resamle = oversampler.fit_resample(X_train, y_train)
print(len(X_train), len(X_test))

#log_clf = LogisticRegression()#
#log_clf = RandomForestClassifier()
#log_clf = xgb.XGBClassifier(max_depth=4, min_child_weight=50, learning_rate=0.01, n_estimators=50, gamma=1)
log_clf = svm.LinearSVC() ##LogisticRegression()
log_clf.fit(X_train_resample, y_train_resamle)
# Score on the original (not over-sampled) training data.
print(log_clf.score(X_train, y_train))
y_pred = log_clf.predict(X_test)
# Under-sample the binary problem using the float ``sampling_strategy`` and
# report the resulting class counts.
rus = RandomUnderSampler(sampling_strategy=sampling_strategy)
X_res, y_res = rus.fit_resample(binary_X, binary_y)
print('Information of the iris data set after making it '
      'balanced using a float and an under-sampling method: \n '
      'sampling_strategy={} \n y: {}'
      .format(sampling_strategy, Counter(y_res)))
plot_pie(y_res)

###############################################################################
# For **over-sampling methods**, it corresponds to the ratio
# :math:`\\alpha_{os}` defined by :math:`N_{rm} = \\alpha_{os} \\times N_{M}`
# where :math:`N_{rm}` and :math:`N_{M}` are the number of samples in the
# minority class after resampling and the number of samples in the majority
# class, respectively.

ros = RandomOverSampler(sampling_strategy=sampling_strategy)
X_res, y_res = ros.fit_resample(binary_X, binary_y)
print('Information of the iris data set after making it '
      'balanced using a float and an over-sampling method: \n '
      'sampling_strategy={} \n y: {}'
      .format(sampling_strategy, Counter(y_res)))
plot_pie(y_res)

###############################################################################
# ``sampling_strategy`` has a ``str``
# ...................................
#
# ``sampling_strategy`` can be given as a string which specifies the class
# targeted by the resampling. With under- and over-sampling, the number of
# samples will be equalized.
#
evaluate_test=[] prenum_train=[] prenum_test=[] skf=StratifiedKFold(n_splits=10) for train,test in skf.split(dataMat,labelMat): #============================================================================== # skf=StratifiedShuffleSplit(n_splits=10) # for train,test in skf.split(dataMat,labelMat): #============================================================================== print("%s %s" % (train,test)) train_in=dataMat[train] test_in=dataMat[test] train_out=labelMat[train] test_out=labelMat[test] train_in, train_out = RandomOverSampler().fit_sample(train_in, train_out) trainWeights=LR.stocGradAscent1(train_in,train_out,500) len_train=np.shape(train_in)[0] len_test=np.shape(test_in)[0] test_predict=[] proba_test=[] for i in range(len_test): test_predict_tmp=LR.classifyVector(test_in[i,:], trainWeights) test_predict.append(test_predict_tmp) proba_test_tmp=LR.classifyProb(test_in[i,:], trainWeights) proba_test.append(proba_test_tmp) train_predict=[] proba_train=[]
# In[22]: #Sampling from imblearn.under_sampling import RandomUnderSampler from imblearn.over_sampling import RandomOverSampler #Under Sampling us = RandomUnderSampler() X_dn, y_dn = us.fit_sample(X_train_tr, y_train) X_test_dn, y_test_dn = us.fit_sample(X_test_tr, y_test) forest_clf_dn.fit(X_dn, y_dn) xgb_clf_dn.fit(X_dn, y_dn) #OverSampling os = RandomOverSampler() X_up, y_up = os.fit_sample(X_train_tr, y_train) X_test_up, y_test_up = os.fit_sample(X_test_tr, y_test) forest_clf_os.fit(X_up, y_up) xgb_clf_os.fit(X_up, y_up) # In[23]: #Forest Undersampled print('Forest Undersampled') evaluate(forest_clf_dn, X_test_dn, y_test_dn) plot_roc(forest_clf_dn, X_test_dn, y_test_dn) #Forest Oversampled print('Forest Oversampled')
"MemberType": { "open loan - dl only": 0, "closed loan - never converted": 1, "converted member": 2 } } wmdat = pd.read_csv("dat_targ.csv") class_names = wmdat["MemberType"].unique() wmdat.replace(names, inplace=True) wm_targ = wmdat["MemberType"] wmdat = wmdat.drop("MemberType", axis=1) wm, wm_test, wm_targets, wm_test_targets = train_test_split(wmdat, wm_targ) ros = RandomOverSampler(random_state=0) wm_resampled, wm_targets_resampled = ros.fit_resample(wm, wm_targets) #%% classifier = GaussianNB() classifier.fit(wm_resampled, wm_targets_resampled) targets_predicted = classifier.predict(wm_test) #print(targets_predicted) acc = sklearn.metrics.accuracy_score(wm_test_targets, targets_predicted) rec = sklearn.metrics.recall_score(wm_test_targets, targets_predicted, average='macro') prec = sklearn.metrics.precision_score(wm_test_targets, targets_predicted, average='macro') print("Accuracy : {} \n Recall : {} \n Precision : {}".format(acc, rec, prec))
# balance the class distribution) # Random Oversampling from collections import Counter from sklearn.datasets import make_classification from imblearn.over_sampling import RandomOverSampler # define dataset X, y = make_classification(n_samples=10000, weights=[0.99], flip_y=0) # summarize class distribution print(Counter(y)) # define oversampling strategy oversample = RandomOverSampler(sampling_strategy='minority') # fit and apply the transform X_over, y_over = oversample.fit_resample(X, y) # summarize class distribution print(Counter(y_over)) #The model is evaluated using repeated 10-fold cross-validation with three #repeats, and the oversampling is performed on the training dataset within # each fold separately, # Evaluating a decision tree on an imbalanced dataset with a 1:100 class # distribution # Template to test oversampling with ur dataset and learning algorithm
columns=['Class', 'Email']) # Generate test set testSet.to_csv(r'data/testSet.csv', index=None, header=True) testSetDistribution = testSet.groupby('Class').size() testSetDistribution.sort_values(ascending=False, inplace=True) print("Test class distribution: \n" + str(testSetDistribution)) # Generate training set trainingSet.to_csv(r'data/trainingSet.csv', index=None, header=True) trainingSetDistribution = trainingSet.groupby('Class').size() trainingSetDistribution.sort_values(ascending=False, inplace=True) print("Training class distribution: \n" + str(trainingSetDistribution)) # Balanced dataset - randomly over-sample smaller classes until all classes are equally represented ros = RandomOverSampler(random_state=42) X_resampled, y_resampled = ros.fit_resample( np.array(X_train).reshape(-1, 1), y_train) balancedTrainingSet = pd.DataFrame(list(zip(y_resampled, X_resampled)), columns=['Class', 'Email']) balancedTrainingSet.to_csv(r'data/trainingSet_balanced.csv', index=None, header=True) balancedTrainingSetDistribution = balancedTrainingSet.groupby('Class').size() balancedTrainingSetDistribution.sort_values(ascending=False, inplace=True) print("Balanced class distribution: \n" + str(balancedTrainingSetDistribution)) #Augmented dataset currentPath = os.getcwd() os.chdir('EDA/code') os.system(
scaler.fit(df) df = pd.DataFrame(scaler.transform(df), columns=df.columns) df["nb_years"] = v return df train = pd.read_csv('train.csv', header=0, sep=",") test = pd.read_csv('test.csv', header=0, sep=",") target = train["status_group"] piv_train = train.shape[0] # Resample # Over-sampling ros = RandomOverSampler(random_state=0) train_resampled, target = ros.fit_resample(train.drop('status_group', axis=1), target) piv_train = train_resampled.shape[0] df_all = pd.concat((train_resampled, test), axis=0, ignore_index=True) features_mask = [ "amount_tsh", "funder", "gps_height", 'installer', 'num_private', "basin", 'wpt_name', 'construction_year', "permit", "extraction_type", "payment", "quantity", "source_class" ] df = df_all ## Model 1: 3 classes df = data_preparation(df)
# df2 = df["coordinates"].apply(lambda x: pd.Series(x, dtype=np.float32)) # df2 = df2.rename(columns= {0: 'lat', 1:'lon'}) X = X.drop([".geo"], axis=1) # X["lat"] = df2['lat'].round(decimals=6 ) # X['lon'] = df2["lon"].round(decimals=6 ) if minimun > 25: print("normal smote") pipeline = SMOTE() X, y = pipeline.fit_resample(X, y) else: print("hibrid smote") over = SMOTE() random = RandomOverSampler(sampling_strategy='minority') steps = [('r', random), ('o', over)] pipeline = Pipeline(steps=steps) X, y = pipeline.fit_resample(X, y) counter = Counter(y) print("Frecuencia das classes SMOTE \n", counter) # print(X.columns) X['class'] = y # print(X.columns) lsDF.append(X.copy()) dfROIs = pd.concat(lsDF) dfROIs['system:index'] = dfROIs.index
print("Recall = " ,recall_score(Y_test, Y_pred_tomek))
confusion_matrix(Y_test, Y_pred_tomek)

# #### Tomek Undersampling doesn't seem a good fit for data. There is hardly any increase in recall compared to the vanilla model. Undersampling techniques, even if they provide an increase in the metric of choice, are not favoured since you tend to lose some information when you undersample the majority class of the target. Hence in most cases, what we prefer to perform are Oversampling techniques like Random Oversampling and SMOTE

# In[203]:

from imblearn.over_sampling import RandomOverSampler

# Code starts here

# Initialise the random over-sampler object with a fixed seed
ros = RandomOverSampler(random_state=0)

# Over-sample the training data only
X_sample_2, y_sample_2 = ros.fit_sample(X_train, Y_train)

# Plot the class counts of the over-sampled target
sns.countplot(y_sample_2)

# Initialise a logistic regression model
model_ros = LogisticRegression()

# Fit the model on the over-sampled training data
model_ros.fit(X_sample_2, y_sample_2)

# Make predictions on the test data (X_test), not the training data
Y_prediction=model_ros.predict(X_test)
#from sklearn import tree from imblearn.over_sampling import RandomOverSampler df = pd.read_csv('datapagitanpa2019.csv', sep=',') df2 = pd.read_csv('datapagi2019.csv', sep=',') data_array = df.values data_array2 = df2.values X_train = data_array[:, 1:40] y_train = data_array[:, 40] #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.0) X_test = data_array2[:, 1:40] y_test = data_array2[:, 40] ros = RandomOverSampler(random_state=42) ambilnama = df.drop(columns='tanggal') ambilnama = ambilnama.drop(columns='ww') feature_names = list(ambilnama.columns) X_train = X_train.astype(float) X_test = X_test.astype(float) y_train = y_train.astype(float) y_test = y_test.astype(float) X_res, y_res = ros.fit_sample(X_train, y_train) X_res = X_res.astype(float) y_res = y_res.astype(float)
if __name__ == '__main__': train_flag = 1 train = get_data(params.train_path + '/234' + params.b_train_file_name) test = get_data(params.train_path + '/456' + params.b_train_file_name) predicted = get_data(params.train_path + '/567' + params.b_train_file_name) train = train.drop(cols, axis=1) test = test.drop(cols, axis=1) # train = util.get_undersample_data(train) # print('undersample', train.shape) (X_train, y_train), (X_test, y_test) = get_X_y(train), get_X_y(test) print(X_train.shape, y_train.shape) X_train, y_train = RandomOverSampler('minority').fit_sample( X_train, y_train) print(X_train.shape, y_train.shape) fs_model = feature_selection(X_train, y_train) print(X_train.shape) X_train = fs_model.transform(X_train) X_test = fs_model.transform(X_test) X_predict = fs_model.transform(predicted.drop(cols, axis=1)) print(X_train.shape) if train_flag: models, names = get_models() estimators = train_predict(models, names, X_train, y_train, X_test, y_test) for estimator, name in zip(estimators, names): util.save_to_file(predicted[cols], estimator.predict(X_predict),
excl_targ = {'TCGA annotation', 'SURV', 'CNV'} tmp = excl_targ.remove(t) df = df.drop(columns=excl_targ) classes = df[t] header = df.columns df1 = df.copy(deep=True) # contains target classes df = df.drop(columns=t) # doesn't contain target classes data = np.array(df).astype(np.float) data = RobustScaler().fit_transform(data) new_data, orig_data, new_classes, orig_classes = train_test_split( data, classes, test_size=0.3) ros = RandomOverSampler() data, classes = ros.fit_sample(new_data, new_classes) # Random forests (MetOncoFit) feat = (data.shape[1] - 10) while (feat < data.shape[1] - 1): trees = 5 while (trees <= 500): rfc = RandomForestClassifier(n_estimators=trees, max_features=feat) rfc.fit(data, classes) trees = trees + 1500 feat = feat + 20 rfc_pred = rfc.predict(orig_data) mean_acc = rfc.score(orig_data, orig_classes)
import sys, os, csv
from imblearn.over_sampling import RandomOverSampler

# Usage: <script> INPUT.csv
# Reads INPUT.csv (header row, integer feature columns, label in the last
# column), randomly over-samples it and writes the balanced rows, without
# a header, to "<INPUT without .csv>-ro-.csv".
input_csv_file = sys.argv[1]
input_csv = input_csv_file.split(".csv")[0]

with open(input_csv_file, newline="") as input_file:
    reader = csv.reader(input_file, delimiter=',')
    with open(input_csv + "-ro-.csv", 'w', newline='') as output_file:
        writer = csv.writer(output_file, delimiter=',')
        X = []
        y = []
        ros = RandomOverSampler()

        # The first row is the header; it is not part of the data.
        next(reader, None)
        for x in reader:
            # csv.reader yields [] for blank lines; indexing x[-1] on one
            # would raise IndexError.
            if not x:
                continue
            y.append(x[-1])
            X.append(list(map(int, x[:-1])))

        X_res, y_res = ros.fit_sample(X, y)
        print (len(X_res))
        print (len(y_res))

        for s, label in zip(X_res, y_res):
            # The label is one cell. list(label) would split a
            # multi-character label into one column per character.
            writer.writerow(list(s) + [label])
bagging_fraction = 1 verbosity = 20 num_boost_round = 20000 verbose_eval = 1000 early_stopping_rounds = 200 reg_alpha = 2 reg_lambda = 15 reduction_rate = [] for random_state in range(0, 15): X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.30, random_state=random_state) #如果 random_state = None (默认值),会随机选择一个种子,这样每次都会得到不同的数据划分。给 random_state 设置相同的值,那么当别人重新运行你的代码的时候就能得到完全一样的结果,复现和你一样的过程。 X_base_train, X_base_test, y_base_train, y_base_test = train_test_split( X_base, y_base, test_size=0.30, random_state=random_state) ros = RandomOverSampler(random_state=0) X_train, y_train = ros.fit_resample(X_train, y_train) X_base_train, y_base_train = ros.fit_resample(X_base_train, y_base_train) #min_max_scaler = MinMaxScaler() #X_train = min_max_scaler.fit_transform(X_train) #X_test = min_max_scaler.fit_transform(X_test) #sc = StandardScaler() #X_train = sc.fit_transform(X_train) #X_test = sc.fit_transform(X_test) #converting the dataset into proper LGB format train_matrix = lgb.Dataset(X_train, label=y_train) valid_matrix = lgb.Dataset(X_test, label=y_test) train_matrix_base = lgb.Dataset(X_base_train, label=y_base_train) valid_matrix_base = lgb.Dataset(X_base_test, label=y_base_test)
def __init__(self):
    """Build ``self.clf``: median imputation, over-sampling, then L2 logistic regression."""
    steps = (
        Imputer(strategy='median'),
        RandomOverSampler(),
        LogisticRegression(C=0.010826367338740546, penalty="l2"),
    )
    self.clf = make_pipeline_imb(*steps)
# Y_indices = [index for index in tag_to_index_map[Y_tags]] X = np.array(data[0::5]) # X = np.random.shuffle(X) max_len = get_max_length(X) prob_test_matrix = prob_test_matrix(data, max_len) #format the input of the model X_train_indices = sentences_to_indices(X, word_to_index, max_len) Y_train = to_categorical(Y_indices) #balance the training set ros = RandomOverSampler( random_state=0) #repeat all tags to the same #of the largest tags # ros = RandomUnderSampler(replacement=True, random_state=0) #Reduce the size of largest tags #shuflle index = [i for i in range(len(X_train_indices))] random.shuffle(index) prob_test_matrix = np.array([prob_test_matrix[i] for i in index]) X_train_indices = np.array([X_train_indices[i] for i in index]) Y_train = np.array([Y_train[i] for i in index]) #split into train and test X_train = X_train_indices[:int(0.8 * len(X_train_indices))] X_test = X_train_indices[int(0.8 * len(X_train_indices)):] Y_test = Y_train[int(0.8 * len(X_train_indices)):] Y_train = Y_train[:int(0.8 * len(X_train_indices))] prob_test_matrix_train = prob_test_matrix[:int(0.8 * len(X_train_indices))]
def balance_data(X, y):
    """Return ``(X, y)`` randomly over-sampled with a fixed seed of 0."""
    features, labels = RandomOverSampler(random_state=0).fit_sample(X, y)
    return features, labels
X_average_train = X_add_train / 2 X_concat_test = hstack((X_Q_te, X_A_te)) X_add_test = X_Q_te + X_A_te X_average_test = X_add_test / 2 #print(X_concat_test) #print(y_tr) y_add_tr = y_tr.copy() y_avg_tr = y_tr.copy() #to get equal zeroes and ones in order for the machine to actually learn well ros = RandomOverSampler(random_state=42) #print(X_concat_train.shape, X_add_train.shape) X_concat_train, y_tr = ros.fit_resample(X_concat_train, y_tr) X_add_train, y_add_tr = ros.fit_resample(X_add_train, y_add_tr) X_average_train, y_avg_tr = ros.fit_resample(X_average_train, y_avg_tr) print("Done some other stuff") #using lbgfs train_vals = X_concat_train, X_add_train, X_average_train, y_tr, y_add_tr, y_avg_tr test_vals = X_concat_test, X_add_test, X_average_test, y_te, y_te, y_te dump(train_vals, train_values) dump(test_vals, test_values) clf = LogisticRegression(random_state=42,
dataset.dropna(inplace=True) # summarize the number of rows and columns in the dataset after listwise drop (sample, vnum) = dataset.shape print(sample, vnum) # Get the number of variables vnum = vnum - 1 # splice into IVs and DV values = dataset.values X = values[:, 0:vnum] y = values[:, vnum] # Oversampling ros = RandomOverSampler(random_state=0) X_R, y_R = ros.fit_sample(X, y) # create model model = Sequential() model.add(Dense(12, input_dim=vnum, kernel_initializer='uniform', activation='relu')) model.add(Dense(8, kernel_initializer='uniform', activation='relu')) model.add(Dense(1, kernel_initializer='uniform', activation='sigmoid')) # Compile model model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) # Fit the model model.fit(X_R, y_R, epochs=150, batch_size=10, verbose=2) # calculate predictions predictions = model.predict(X) # round predictions
scores = [] for feature in list(data.columns): # onehot encode the feature feature_data = data[[feature]] encoded_feature_data = pd.get_dummies(feature_data) print '\n' print feature print feature_data.shape print encoded_feature_data.shape print y.shape # upsample minority class from imblearn.over_sampling import RandomOverSampler ros = RandomOverSampler(ratio=0.5) X_resampled, y_resampled = ros.fit_sample(encoded_feature_data, y) print '\n' print X_resampled.shape print y_resampled.shape # create train and test split X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, random_state=0, test_size=0.2) print '\n' print 'Training data' print X_train.shape print y_train.shape print 'Testing data'
#scaler = StandardScaler() #X = scaler.fit_transform(X) df = pd.read_csv('../Data/validation.csv', header=0) X_valid = df.iloc[:,0:-1].copy() Y_valid = df.iloc[:, -1].copy() #scaler = StandardScaler() #X_valid = scaler.fit_transform(X_valid) # Handle the dataset with undersampling strategy rus = RandomUnderSampler(sampling_strategy=0.8) X_res, Y_res = rus.fit_resample(X, Y) # Handle the dataset with oversampling strategy ros = RandomOverSampler(random_state=0) X_resampled, Y_resampled = ros.fit_resample(X, Y) # Handle the dataset with SMOTE SM = SMOTE(random_state=0) X_smote, Y_smote = SM.fit_sample(X, Y) score_infor = [[],[],[],[]] roc_auc_score_infor = [[],[],[],[]] f1_score_infor = [[],[],[],[]] #print(pd.value_counts(Y_smote)) for weight_percent in range(1, 100): class_weight = {0: weight_percent, 1: (100-weight_percent)}
def random_over_sampler(X, y):
    """Return ``(X, y)`` randomly over-sampled with a fixed seed of 42."""
    features, labels = RandomOverSampler(random_state=42).fit_resample(X, y)
    return features, labels
def random_oversampling(feature_data, feature_label, random_state):
    """Randomly over-sample the data with the given ``random_state``.

    Returns the resampled ``(features, labels)`` pair.
    """
    sampler = RandomOverSampler(random_state=random_state)
    features, labels = sampler.fit_resample(feature_data, feature_label)
    return features, labels
count = 0 accuracy = np.zeros(split_num) loss = np.zeros(split_num) aauc = np.zeros(split_num) skf = StratifiedKFold(n_splits=5, random_state=0, shuffle=True) for train_pick, test_pick in skf.split(features, label): X_all = features X_test = features[test_pick, :] X_train = features[train_pick, :] y_all = label y_test = label[test_pick] y_train = label[train_pick] if IF_SMOTE: ros = RandomOverSampler(random_state=0) X_train, y_train = ros.fit_sample(X_train, y_train) X_test, y_test = ros.fit_sample(X_test, y_test) # data pre-processing y_train = np_utils.to_categorical(y_train, num_classes=labelNum) y_test = np_utils.to_categorical(y_test, num_classes=labelNum) y_all = np_utils.to_categorical(y_all, num_classes=labelNum) # We add metrics to get more results you want to see model.compile(optimizer=optim, loss='binary_crossentropy', metrics=['accuracy']) print('Training ------------') # Another way to train the model
= train_test_split(X, y, test_size=0.20, random_state=42) # Split train_val data into training set and validation set X_train, X_val, y_train, y_val \ = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42) # ========================================================================================== # Over-sampled data # Generate the new dataset using under-sampling method verbose = False ratio = 'auto' # 'Random over-sampling' OS = RandomOverSampler(ratio=ratio, verbose=verbose) X_train_os, y_train_os = OS.fit_sample(X_train, y_train) # 'SMOTE' smote = SMOTE(ratio=ratio, verbose=verbose, kind='regular') X_train_smo, y_train_smo = smote.fit_sample(X_train, y_train) # 'SMOTE bordeline 1' bsmote1 = SMOTE(ratio=ratio, verbose=verbose, kind='borderline1') X_train_bs1, y_train_bs1 = bsmote1.fit_sample(X_train, y_train) # 'SMOTE bordeline 2' bsmote2 = SMOTE(ratio=ratio, verbose=verbose, kind='borderline2') X_train_bs2, y_train_bs2 = bsmote2.fit_sample(X_train, y_train) # 'SMOTE SVM'
file = open("CrashSCDect_perforance_results.csv","w") file.write("classifier,f1,auc\n") def store_results(method_name:str, f1_resuts, auc_results): print("%s performance: f1:%s auc:%s" % (method_name,np.mean(f1_resuts), np.mean(auc_results))) for i in range(len(f1_resuts)): file.write(method_name+","+str(f1_resuts[i])+","+str(auc_results[i])+"\n") folds =0 for folds in range(10): print("Folds: %s"%(folds)) # sampled = RandomOverSampler(random_state=folds) X_sampled,Y_sampled = sampled.fit_resample(pydata.drop(['label'], axis=1), pydata['label']) print(X_sampled.shape) print(Y_sampled.shape) strKFold = StratifiedKFold(n_splits=10, shuffle=True, random_state=folds) scoring = ['roc_auc', 'f1'] rf = RandomForestClassifier() results_rf = cross_validate( rf, X=X_sampled, y=Y_sampled, cv=strKFold,
from imblearn.under_sampling import RandomUnderSampler, NearMiss from csgo_cheater_detection.config.config import data_path, seed, random_state # load data df = pd.read_csv(f'{data_path}\\csgo_cheater_data_8_30_20_full.csv') # Split X and y y = df.pop('label') X = df # set seeds np.random.seed(seed) # Random Over-Sampling ros = RandomOverSampler(random_state=random_state) X_temp, y_temp = ros.fit_resample(X, y) X_temp['label'] = y_temp print(X_temp.columns) # parameters sampling_methods = { 'random_over': RandomOverSampler(), 'SMOTE': SMOTE(), 'random_under': RandomUnderSampler(), 'near_miss': NearMiss() } # loop for name, method in sampling_methods.items():
rus = RandomUnderSampler(sampling_strategy=sampling_strategy)
X_res, y_res = rus.fit_resample(binary_X, binary_y)
print('Information of the iris data set after making it '
      'balanced using a float and an under-sampling method: \n '
      'sampling_strategy={} \n y: {}'
      .format(sampling_strategy, Counter(y_res)))
plot_pie(y_res)

###############################################################################
# For **over-sampling methods**, it corresponds to the ratio
# :math:`\\alpha_{os}` defined by :math:`N_{rm} = \\alpha_{os} \\times N_{M}`
# where :math:`N_{rm}` and :math:`N_{M}` are the number of samples in the
# minority class after resampling and the number of samples in the majority
# class, respectively.

ros = RandomOverSampler(sampling_strategy=sampling_strategy)
X_res, y_res = ros.fit_resample(binary_X, binary_y)
print('Information of the iris data set after making it '
      'balanced using a float and an over-sampling method: \n '
      'sampling_strategy={} \n y: {}'
      .format(sampling_strategy, Counter(y_res)))
plot_pie(y_res)

###############################################################################
# ``sampling_strategy`` has a ``str``
# ...................................
#
# ``sampling_strategy`` can be given as a string which specifies the class
# targeted by the resampling. With under- and over-sampling, the number of
# samples will be equalized.
#
plt.tight_layout()
plt.show()

# In[21]:

# Checking that both sets have the same number of fraud cases
assert Counter(y_rus)[1] == Counter(y_train)[1]

# <a id='ros'></a>
# ## Random oversampling

# In[22]:

ros = RandomOverSampler(sampling_strategy='auto', random_state=42)
X_ros, y_ros = ros.fit_resample(X_train, y_train)

# Checking if the classes are balanced:

# In[23]:

plt.bar(
    ['Non-Fraud', 'Fraud'],
    [Counter(y_ros)[0], Counter(y_ros)[1]],
    color=['b', 'r'],
)
plt.xlabel('Class')
plt.ylabel('Number of transactions')
# Print each class count inside the plot area (axes-fraction coordinates)
plt.annotate('{}'.format(Counter(y_ros)[0]), (0.20, 0.45),
             xycoords='axes fraction')
plt.annotate('{}'.format(Counter(y_ros)[1]), (0.70, 0.45),
             xycoords='axes fraction')
weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=200, random_state=10) # Instanciate a PCA object for the sake of easy visualisation pca = PCA(n_components=2) # Fit and transform x to visualise inside a 2D feature space X_vis = pca.fit_transform(X) # Apply the random over-sampling ros = RandomOverSampler() X_resampled, y_resampled = ros.fit_sample(X, y) X_res_vis = pca.transform(X_resampled) # Two subplots, unpack the axes array immediately f, (ax1, ax2) = plt.subplots(1, 2) c0 = ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5) c1 = ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5) ax1.set_title('Original set') ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1], label="Class #0", alpha=.5) ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
def use_parameters(self, X_train, selected_features):
    '''
    Build the hyper-parameter search grid for the XGBoost pipeline.

    The grid is a list holding a single dict whose keys address the pipeline
    steps ('scaler', 'sampling', 'feat__cols', 'model__*'). When ``X_train``
    contains missing values, an 'imputer__strategy' entry is added to that
    dict so that several imputation strategies are searched as well.

    Parameters
    ----------
    X_train : object exposing ``isna().sum().sum()`` (e.g. a pandas DataFrame)
        Training features; only inspected for missing values.
    selected_features : list
        Candidate values for the 'feat__cols' grid entry.

    Returns
    -------
    list of dict
        The parameter grid.
    '''
    test_scaler = [
        StandardScaler(),
        RobustScaler(),
        QuantileTransformer(),
        Normalizer()
    ]

    test_sampling = [
        modelutil.Nosampler(),
        ClusterCentroids(),
        RandomUnderSampler(),
        # NearMiss(version=1),
        # EditedNearestNeighbours(),
        # AllKNN(),
        # CondensedNearestNeighbour(random_state=0),
        # InstanceHardnessThreshold(random_state=0,
        #     estimator=LogisticRegression(solver='lbfgs', multi_class='auto')),
        RandomOverSampler(random_state=0),
        SMOTE(),
        BorderlineSMOTE(),
        SMOTEENN(),
        SMOTETomek(),
        ADASYN()
    ]

    ### XGBOOST
    parameters = [{
        'scaler': test_scaler,
        'sampling': test_sampling,
        'feat__cols': selected_features,
        'model__objective': ['logloss'],
        'model__learning_rate': [0.005, 0.01, 0.05, 0.1, 0.5],  # so called `eta` value
        'model__max_depth': [3, 4, 5],
        'model__min_child_weight': [1, 5, 11, 12, 15],
        'model__silent': [0],
        'model__subsample': [0.6, 0.8, 1.0],
        'model__colsample_bytree': [0.6, 0.8, 1.0],
        'model__n_estimators': [5, 50, 100],  # number of trees, change it to 1000 for better results
        'model__missing': [-999],
        'model__gamma': [0.5, 1, 1.5, 2, 5],
        'model__seed': [1337]
    }]

    # If no missing values, only one imputer strategy shall be used
    if X_train.isna().sum().sum() > 0:
        # `parameters` is a list wrapping one grid dict, so the new entry has
        # to be added to that dict; indexing the list with a string key
        # raises TypeError.
        parameters[0]['imputer__strategy'] = [
            'mean', 'median', 'most_frequent'
        ]
        print("Missing values used. Test different imputer strategies")
    else:
        print("No missing values. No imputer necessary")

    print("Selected Parameters: ", parameters)
    # else:
    print("Parameters defined in the input: ", parameters)

    ### XGBOOST
    return parameters
def load_features(task):
    """Load, join and (optionally) resample the features/outcomes of a task.

    Reads the feature and outcome CSV files selected by ``task['dataset']``
    from ``features_directory``, joins them on the task's id field, splits
    the result into a feature matrix and a label vector, applies the
    resampling requested by ``task['sampling_mode']`` and returns the
    outcome of ``util.unison_shuffle(X, y)``.

    Parameters
    ----------
    task : dict
        Reads the keys 'pref_id', 'dataset' ('dataset' or 'field'),
        'outcome_variable_name', 'prediction_task' and 'sampling_mode'
        ('over', 'under', an int giving the number of samples per class,
        or anything else for no resampling). The keys
        'features_df_file_name', 'outcomes_df_file_name' and 'id_field'
        are written into it.

    Returns
    -------
    Whatever ``util.unison_shuffle(X, y)`` returns.

    Raises
    ------
    AssertionError
        If ``task['dataset']`` is neither 'dataset' nor 'field'.
    """
    log_file = log_dir + 'loading_task_' + str(task['pref_id']) + '.txt'
    load_logger = logger(log_file, task)

    # Allowed outcome values per outcome variable and prediction task.
    dataset_prediction_task_to_outcomes = {
        'all_one_trace_type': {
            'two': ['line', 'bar'],
            'three': ['line', 'scatter', 'bar'],
            'six': ['line', 'scatter', 'bar', 'box', 'histogram', 'pie'],
        },
        'has_single_src': {
            'two': [True, False]
        },
        'num_x_axes': {
            'numeric': list(range(5))
        },
        'num_y_axes': {
            'numeric': list(range(5))
        }
    }

    field_prediction_task_to_outcomes = {
        'trace_type': {
            'two': ['line', 'bar'],
            'three': ['line', 'scatter', 'bar'],
            'six': ['line', 'scatter', 'bar', 'box', 'histogram', 'heatmap'],
        },
        'is_xsrc': {
            'two': [True, False]
        },
        'is_ysrc': {
            'two': [True, False]
        },
        'is_x_or_y': {
            'two': ['x', 'y']
        },
        'is_single_src': {
            'two': [True, False]
        }
    }

    # Pick the input files, id column and outcome table for the dataset level.
    if task['dataset'] == 'dataset':
        task['features_df_file_name'] = 'features_aggregate_single_pairwise.csv'
        task['outcomes_df_file_name'] = 'chart_outcomes.csv'
        task['id_field'] = 'fid'
        prediction_task_to_outcomes = dataset_prediction_task_to_outcomes
    else:
        assert task['dataset'] == 'field'
        task['features_df_file_name'] = 'field_level_features.csv'
        task['outcomes_df_file_name'] = 'field_level_outcomes.csv'
        task['id_field'] = 'field_id'
        prediction_task_to_outcomes = field_prediction_task_to_outcomes

    features_df = pd.read_csv(
        join(features_directory, task['features_df_file_name']),
        nrows=num_datapoints)
    outcomes_df = pd.read_csv(
        join(features_directory, task['outcomes_df_file_name']),
        nrows=num_datapoints)

    # Close the lookup file deterministically instead of leaking the handle.
    # NOTE: pickle must only be used on trusted files -- unpickling can
    # execute arbitrary code.
    lookup_path = join(features_directory, feature_set_lookup_file_name)
    with open(lookup_path, 'rb') as lookup_file:
        feature_names_by_type = pickle.load(lookup_file)

    # print(features_df)
    # print('Initial Features:', features_df.shape)
    # print('Initial Outcomes:', outcomes_df.shape)
    # load_logger.log_dict(feature_names_by_type)
    # load_logger.log('\n')
    # load_logger.log(features_df)
    load_logger.log('Initial Features: ' + str(features_df.shape))
    load_logger.log('Initial Outcomes: ' + str(outcomes_df.shape))

    if task['dataset'] == 'field':
        def is_x_or_y(is_xsrc, is_ysrc):
            # NOTE(review): a NaN is_xsrc/is_ysrc is truthy, so two NaN inputs
            # yield 'x' -- confirm this is acceptable.
            if is_xsrc and pd.isnull(is_ysrc):
                return 'x'
            if is_ysrc and pd.isnull(is_xsrc):
                return 'y'
            else:
                return None

        # NOTE(review): np.vectorize infers the output dtype from the first
        # result, so None entries may be coerced to strings -- confirm that
        # format_outcomes_df filters them out.
        outcomes_df['is_x_or_y'] = np.vectorize(is_x_or_y)(
            outcomes_df['is_xsrc'], outcomes_df['is_ysrc'])
        outcomes_df['is_single_src'] = (
            outcomes_df['is_single_xsrc'] | outcomes_df['is_single_ysrc'])

    outcomes_df_subset = format_outcomes_df(
        load_logger,
        outcomes_df,
        task['outcome_variable_name'],
        prediction_task_to_outcomes[task['outcome_variable_name']][task['prediction_task']],
        id_field=task['id_field'])

    final_df = join_features_and_outcomes(
        features_df, outcomes_df_subset, on=task['id_field'])

    # Every column before the outcome column is a feature.
    last_index = final_df.columns.get_loc(task['outcome_variable_name'])
    X = final_df.iloc[:, :last_index]
    y = final_df.iloc[:, last_index]

    # print('Intermediate Outcomes:', y.shape)
    # value_counts = y.value_counts()
    # print('Value counts:')
    # print(value_counts)
    load_logger.log('Final DF Shape: ' + str(final_df.shape))
    load_logger.log('Last Index: ' + str(last_index))
    load_logger.log('Intermediate Outcomes: ' + str(y.shape))
    load_logger.log('Value counts: \n' + str(y.value_counts()))

    # delete variables to save memory!
    del final_df, outcomes_df

    # NOTE(review): the per-feature-set column indices computed here are not
    # used afterwards; the loop is kept because it calls a project helper
    # whose effects are not visible from this function.
    task_types = ['dimensions', 'types', 'values', 'names']
    for task_name in task_types:
        names = get_feature_set_names_by_type(
            feature_names_by_type,
            task_type=task['dataset'],
            feature_set=task_name)
        indices = [X.columns.get_loc(c) for c in names if c in X.columns]
        # print('task is ' + task_name + ' and indices are:')
        # print('names are {}'.format(names))
        # print(indices)
        # load_logger.log('task is ' + task_name + ' and indices are: ')
        # load_logger.log(indices)

    # Encode the outcome as the integer position of its dummy column.
    y = pd.get_dummies(y).values.argmax(1)

    if task['sampling_mode'] == 'over':
        res = RandomOverSampler(random_state=RANDOM_STATE)
        X, y = res.fit_sample(X, y)
    elif task['sampling_mode'] == 'under':
        res = RandomUnderSampler(random_state=RANDOM_STATE)
        X, y = res.fit_sample(X, y)
    elif isinstance(task['sampling_mode'], int):
        # Draw the requested number of samples from every class separately.
        X_resampled_arrays, y_resampled_arrays = [], []
        for outcome in np.unique(y):
            outcome_mask = (y == outcome)
            X_resampled_outcome, y_resampled_outcome = resample(
                X[outcome_mask],
                y[outcome_mask],
                n_samples=task['sampling_mode'],
                random_state=RANDOM_STATE
            )
            X_resampled_arrays.append(X_resampled_outcome)
            y_resampled_arrays.append(y_resampled_outcome)

        X = np.concatenate(X_resampled_arrays).astype(np.float64)
        y = np.concatenate(y_resampled_arrays)
    else:
        X = X.values.astype(np.float64)

    # print('Final Features:', X.shape)
    # print('Final Outcomes:', y.shape)
    load_logger.log('Final Features:' + str(X.shape))
    load_logger.log('Final Outcomes:' + str(y.shape))
    unique, counts = np.unique(y, return_counts=True)
    load_logger.log('Value counts after sampling:')
    load_logger.log_dict(dict(zip(unique, counts)))
    load_logger.log('\n')

    del load_logger
    return util.unison_shuffle(X, y)
def runns(resp_var, size_of_test_data,dataset,positive_class,n_estimators,important_features,dealing_with_nulls):
    """Train an RBF-kernel SVC on the most important predictors and report metrics.

    Steps: drop rows whose response is null, handle remaining nulls via
    ``deal_with_nulls``, one-hot encode the predictors, balance the classes
    by random over-sampling, rank the predictors with a random forest, keep
    the ``important_features`` highest-ranked ones, split into train/test,
    fit the SVC and compute the evaluation metrics.

    Parameters
    ----------
    resp_var : str
        Name of the response column.
    size_of_test_data : float or int
        Passed to ``train_test_split`` as ``test_size``.
    dataset :
        Currently ignored -- see the NOTE at the top of the body.
    positive_class :
        Label treated as the positive class by the precision, average
        precision, recall, F1 and F-beta scores.
    n_estimators : int
        Number of trees of the random forest used for feature ranking.
    important_features : int
        Number of top-ranked predictors kept for the SVC.
    dealing_with_nulls :
        Strategy forwarded to ``deal_with_nulls``.

    Returns
    -------
    The ``jsonify`` response ``{"Predictions": <JSON string of the metrics>}``.
    """
    # NOTE(review): this overwrites the `dataset` argument with a hard-coded
    # file; it is marked "for testing purposes" -- confirm whether the
    # argument should be used instead.
    dataset = pd.read_csv('raw_data.csv', low_memory=False)  # For testing purposes

    #----DATA PREPROCESSING
    # Remove the rows in which the response is null, then handle other nulls.
    dataset = dataset.dropna(subset=[resp_var])
    dataset = deal_with_nulls(dealing_with_nulls, dataset)

    #----FEATURE SELECTION
    # Transform categorical predictors to dummy variables.
    predictors = pd.get_dummies(dataset.drop(resp_var, axis=1, inplace=False))
    predictor_names = list(predictors)

    # Balance the classes in the response variable.
    # NOTE(review): over-sampling happens before the train/test split, so
    # duplicated rows can end up in both partitions -- confirm intended.
    ros = RandomOverSampler(random_state=0)
    prds, resp = ros.fit_sample(predictors, dataset[resp_var])

    # Rank the predictors with a random forest.
    rf_clf = RandomForestClassifier(n_estimators=n_estimators)
    rf_clf.fit(prds, resp)
    feature_imp = pd.Series(
        rf_clf.feature_importances_,
        index=predictor_names).sort_values(ascending=False)
    important_predictor_names = list(feature_imp.index[0:important_features])

    # Keep only the important predictors. The original computed their names
    # but never applied the selection, so `important_features` had no effect.
    selected = pd.DataFrame(prds, columns=predictor_names)[important_predictor_names]

    #----MODEL TRAINING
    labels = np.array(resp)
    features = np.array(pd.get_dummies(selected))

    # Split the data into training and testing sets
    train_features, test_features, train_labels, test_labels = train_test_split(
        features, labels, test_size=size_of_test_data, random_state=402)

    # Instantiate the SVC with probability estimates enabled and train it.
    clf = SVC(kernel='rbf', probability=True)
    clf.fit(train_features, train_labels)

    #----EVALUATION
    predicted = clf.predict(test_features)
    pred_prob = clf.predict_proba(test_features)

    accuracy = accuracy_score(test_labels, predicted)
    precision = precision_score(test_labels, predicted, pos_label=positive_class)

    # predict_proba columns follow clf.classes_, so look up the column of the
    # positive class instead of assuming it is column 1.
    positive_col = list(clf.classes_).index(positive_class)
    avg_precision = average_precision_score(
        test_labels, pred_prob[:, positive_col], pos_label=positive_class)

    rec = recall_score(test_labels, predicted, pos_label=positive_class)
    fscore = f1_score(test_labels, predicted, pos_label=positive_class)
    # Use the same positive class as the other precision/recall based scores.
    fbeta = fbeta_score(test_labels, predicted, beta=0.5, pos_label=positive_class)
    hamming = hamming_loss(test_labels, predicted)
    jaccard = jaccard_similarity_score(test_labels, predicted)
    # Log loss is defined on predicted probabilities, not on hard labels.
    logloss = log_loss(test_labels, pred_prob, labels=clf.classes_)
    zero_one = zero_one_loss(test_labels, predicted)
    # Binary AUC scored with the probability of the second class.
    area_under_roc = roc_auc_score(test_labels, pred_prob[:, 1])
    cohen = cohen_kappa_score(test_labels, predicted)
    mathews = matthews_corrcoef(test_labels, predicted)

    output = {"accuracy":accuracy,"precision":precision,"average precision":avg_precision,"recall":rec,"fscore":fscore,"fbeta":fbeta,"hamming":hamming,"jaccard":jaccard,"logloss":logloss,"zero_one":zero_one,"area_under_roc":area_under_roc,"cohen":cohen,"mathews":mathews}
    output = json.dumps(output)
    return jsonify({"Predictions": output})
clf_name = "knn" resultFolder = "/home/sherlock/Internship@iit/exudate-detection/" + clf_name + "_results-exudates/" clf = KNeighborsClassifier(n_neighbors=10) clf.fit(X_train, Y_train) Y_predicted = clf.predict(X_test) print("accuracy") print(accuracy_score(Y_test, Y_predicted)) print("confusion matrix") print(confusion_matrix(Y_test, Y_predicted)) writeResults(DestinationFolder, resultFolder, name_array, clf_name, Y_predicted) print("DONE_-------------------x----xxxxx-xx-x") from imblearn.over_sampling import RandomOverSampler ros = RandomOverSampler(random_state=0) X_resampled, Y_resampled = ros.fit_sample(X_train, Y_train) print("when balanced classes : ") print(sorted(Counter(Y_resampled).items())) print("RANDOM FOREST") clf_name = "rf" resultFolder = "/home/sherlock/Internship@iit/exudate-detection/" + clf_name + "_BAL_results-exudates/" clf = RandomForestClassifier(n_estimators=10) clf.fit(X_resampled, Y_resampled) Y_predicted = clf.predict(X_test) print("accuracy") print(accuracy_score(Y_test, Y_predicted)) print("confusion matrix") print(confusion_matrix(Y_test, Y_predicted)) writeResults(DestinationFolder, resultFolder, name_array, clf_name,
print(__doc__) # Generate the dataset X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=200, random_state=10) # Instanciate a PCA object for the sake of easy visualisation pca = PCA(n_components=2) # Fit and transform x to visualise inside a 2D feature space X_vis = pca.fit_transform(X) # Apply the random over-sampling ros = RandomOverSampler() X_resampled, y_resampled = ros.fit_resample(X, y) X_res_vis = pca.transform(X_resampled) # Two subplots, unpack the axes array immediately f, (ax1, ax2) = plt.subplots(1, 2) c0 = ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5) c1 = ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5) ax1.set_title('Original set') ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1], label="Class #0", alpha=.5) ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
# Encode the class labels as integers ...
encoder = preprocessing.LabelEncoder()
encoded_Y = encoder.fit_transform(target)
# ... and convert the integers to dummy variables (i.e. one hot encoded)
dummy_y = np_utils.to_categorical(encoded_Y)

# separate data into training and (validation + testing) datasets in a 70/30 (20/10) proportion
X_train, X_partial, y_train, y_partial = train_test_split(
    features, dummy_y, test_size=0.3, random_state=rand_state)
X_val, X_test, y_val, y_test = train_test_split(
    X_partial, y_partial, test_size=0.33, random_state=rand_state)

# Rebalance the training data. Both samplers are created, but only the
# under-sampler is applied; the over-sampling call is disabled.
ros = RandomOverSampler(sampling_strategy='minority', random_state=12)
rus = RandomUnderSampler(random_state=12, replacement=True)
# X_train_res, y_train_res = ros.fit_resample(X_train, y_train)
X_train_res, y_train_res = rus.fit_resample(X_train, y_train)

# Reobtain the correct training, validation and testing datasets:
# features restricted to features_list, targets taken as they are.
X_train_reduced = X_train_res.loc[:, features_list]
X_val_reduced = X_val.loc[:, features_list]
X_test_reduced = X_test.loc[:, features_list]

y_train_reduced = y_train_res  # X_train_res.loc[:, targets_list]  # Yes, X_train_res is correct
y_val_reduced = y_val  # X_val.loc[:, targets_list]
y_test_reduced = y_test  # X_test.loc[:, targets_list]

# Samples no_zeros giving it the same number of values for all vb_slice ranges