def format_X_y(pos_file, neg_file, flag):
    """Load positive and negative feature vectors and build ``X`` and ``y``.

    Each file is read line by line; every line is split on whitespace and
    each token converted to ``float``. Lines that cannot be converted are
    printed and skipped.

    Args:
        pos_file: Path of the file holding positive samples (label ``1``).
        neg_file: Path of the file holding negative samples (label ``0``).
        flag: ``'train'`` or ``'test'``. Only when it equals ``'train'`` are
            the positive samples passed through ``SMOTE.smote(X, 15, 3)``.

    Returns:
        A ``(X, y)`` tuple: the positive vectors (over-sampled when
        training) followed by the negative vectors, and the matching labels.
    """

    def read_vectors(path):
        # Parse one whitespace-separated float vector per line of `path`.
        vectors = []
        with open(path) as handle:
            for line in handle:
                try:
                    vector = [float(item) for item in line.strip().split()]
                except ValueError:
                    # Report the malformed line and carry on with the rest.
                    print(line)
                else:
                    vectors.append(vector)
        return vectors

    X = read_vectors(pos_file)
    y = [1] * len(X)

    if flag == 'train':
        # Over-sample the positive feature vectors with SMOTE.
        # NOTE(review): the meaning of the arguments 15 and 3 is defined by
        # SMOTE.smote, which is not visible here -- confirm before tuning.
        X = SMOTE.smote(X, 15, 3)
        # Every vector SMOTE added is positive, so pad the labels to match.
        y += [1] * (len(X) - len(y))

    # Negative samples are appended as-is; they are not down-sampled.
    negatives = read_vectors(neg_file)
    X += negatives
    y += [0] * len(negatives)
    return X, y
def apply_smote(df):
    """Run ``SMOTE.smote`` on *df* and return the result with *df*'s columns.

    The index of *df* is reset in place (old index dropped) before the
    sampler is built, so the caller's frame is modified as a side effect.
    The object returned by ``run()`` gets the original column labels
    assigned and is handed back.
    """
    original_columns = df.columns
    df.reset_index(drop=True, inplace=True)
    resampled = SMOTE.smote(df).run()
    resampled.columns = original_columns
    return resampled
def cross_val(data, labels, k, smote, classifier):
    """Run k-fold cross validation with the classifier selected by name.

    Args:
        data: Indexable collection of samples.
        labels: Indexable collection of labels, aligned with *data*.
        k: Number of folds handed to ``KFold(n_splits=k)``.
        smote: When truthy, each training fold is passed through
            ``SMOTE.SMOTEd`` before the classifier is trained.
        classifier: One of ``'linear'``, ``'logistic'``, ``'decision tree'``,
            ``'neuralnetwork'``, ``'naive bayes'``, ``'randomforest'`` or
            ``'knn'``.

    Returns:
        ``[test_label, predicted]`` -- the true labels and the predictions
        of the fold that was processed last.
        NOTE(review): results of earlier folds are overwritten rather than
        accumulated; confirm whether callers expect all folds.

    Raises:
        ValueError: If *classifier* is not one of the supported names.
    """
    # Public classifier name -> function name inside the `classifiers` module.
    runner_names = {
        'linear': 'lin_reg',
        'logistic': 'log_reg',
        'decision tree': 'decision_tree',
        'neuralnetwork': 'neuralnetwork',
        'naive bayes': 'naive_bayes',
        'randomforest': 'randomforest',
        'knn': 'knn',
    }
    # Fail fast with a clear error instead of printing a message and then
    # crashing later on the undefined `predicted` variable.
    if classifier not in runner_names:
        raise ValueError('Wrong name supplied: %s' % classifier)
    run_classifier = getattr(classifiers, runner_names[classifier])

    test_label, predicted = [], []
    for train_index, test_index in KFold(n_splits=k).split(data):
        # Materialise the train and test sets/labels for this fold.
        train_set = [data[i] for i in train_index]
        train_label = [labels[i] for i in train_index]
        test_set = [data[i] for i in test_index]
        test_label = [labels[i] for i in test_index]

        # Only the training portion is over-sampled.
        if smote:
            train_set, train_label = SMOTE.SMOTEd(train_set, train_label)

        predicted = run_classifier(train_set, test_set, train_label)

    return [test_label, predicted]
def format_X_y(pos_file, neg_file,flag):
    """Load positive and negative feature vectors and build ``X`` and ``y``.

    Each file is read line by line; every line is split on whitespace and
    each token converted to ``float``. Lines that cannot be converted are
    printed and skipped.

    Args:
        pos_file: Path of the file holding positive samples (label ``1``).
        neg_file: Path of the file holding negative samples (label ``0``).
        flag: ``'train'`` or ``'test'``. Only when it equals ``'train'`` are
            the positive samples passed through ``SMOTE.smote(X, 15, 3)``.

    Returns:
        A ``(X, y)`` tuple: the positive vectors (over-sampled when
        training) followed by the negative vectors, and the matching labels.
    """

    def read_vectors(path):
        # Parse one whitespace-separated float vector per line of `path`.
        vectors = []
        with open(path) as handle:
            for line in handle:
                try:
                    vector = [float(item) for item in line.strip().split()]
                except ValueError:
                    # Report the malformed line and carry on with the rest.
                    print(line)
                else:
                    vectors.append(vector)
        return vectors

    X = read_vectors(pos_file)
    y = [1] * len(X)

    if flag == 'train':
        # Over-sample the positive feature vectors with SMOTE.
        # NOTE(review): the meaning of the arguments 15 and 3 is defined by
        # SMOTE.smote, which is not visible here -- confirm before tuning.
        X = SMOTE.smote(X, 15, 3)
        # Every vector SMOTE added is positive, so pad the labels to match.
        y += [1] * (len(X) - len(y))

    # Negative samples are appended as-is; they are not down-sampled.
    negatives = read_vectors(neg_file)
    X += negatives
    y += [0] * len(negatives)
    return X, y
def apply_smote(self, df):
    """Run ``SMOTE.smote`` on *df* and return the result with *df*'s columns.

    The column labels of *df* are remembered, the sampler's ``run()`` output
    is relabelled with them, and that output is returned.
    """
    original_columns = df.columns
    resampled = SMOTE.smote(df).run()
    resampled.columns = original_columns
    return resampled
'gamma': gammaRange, 'C': cRange, 'class_weight': classWeightRange, 'decision_function_shape': ['ovr'] # ['ovo', 'ovr', None] }] scores = ['f1', 'f1_macro'] # ['f1_macro', 'precision_macro', 'f1_micro'] # train_counts = count_vectorizer.fit_transform(train_corpus) # vect = DictVectorizer() # train_counts = vect.fit_transform(features(tokenize, d) for d in train_corpus) train_dict = [features(tokenize, d) for d in train_corpus] newMinoritySamples = SMOTE.smoteAlgo(getMinoritySamples( train_dict, train_labels), rate=4, k=100, random_seed=RANDOMSEED) train_dict = train_dict + newMinoritySamples train_labels = train_labels + [1] * len(newMinoritySamples) vect = DictVectorizer() train_counts = vect.fit_transform(train_dict) (train_counts, train_labels) = shuffle(train_counts, train_labels, random_state=RANDOMSEED) for score in scores: print("# Tuning hyper-parameters for %s" % score) print()
def createSMOTEsamples(X,Y,nearestneigh,numNeighbors,majoritylabel,minoritylabel):
    """Create synthetic samples and publish them through module globals.

    All arguments are forwarded unchanged to
    ``SMOTE.createSyntheticSamples``. The two values it returns are stored
    in the module-level names ``synX`` and ``synY``; nothing is returned.
    """
    global synX, synY
    synthetic = SMOTE.createSyntheticSamples(
        X, Y, nearestneigh, numNeighbors, majoritylabel, minoritylabel)
    synX, synY = synthetic
import pandas as pd
import SMOTE as sm

df = pd.read_csv('sample.csv', header=None)


def pre_processing(dataset):
    """Turn a DataFrame into a list of rows, each row a plain list.

    The frame is transposed so that every column of the transposed frame
    is one row of the original; each of those columns is then copied into
    a list.
    """
    transposed = dataset.T
    rows = []
    for label in transposed:
        rows.append(list(transposed[label]))
    return rows


df = pre_processing(df)

# Rows 50..74: all 25 class 'B' samples serve as the minority input set.
minority = df[50:75]

# augment() is the SMOTE entry point of the imported module.
syn = sm.augment(minority, 50, 5)
print(syn)
print(len(syn))

# Alternative run kept for reference:
# syn = sm.augment(minority, 100, 7)
# print(syn[0:25])
# print(syn[25:50])
def _templet_sampler_factories(sample_ratio):
    """Map each supported sampler name to a zero-argument sampler factory."""
    return {
        'CART': lambda: DummySampler(),
        'SMOTE': lambda: SMOTE(N=sample_ratio, k_neighbors=5,
                               random_state=42),
        'Border1': lambda: BorderSMOTE(N=sample_ratio, m_neighbors=9,
                                       k_neighbors=5, random_state=42,
                                       kind='borderline1'),
        'Border2': lambda: BorderSMOTE(N=sample_ratio, m_neighbors=9,
                                       k_neighbors=5, random_state=42,
                                       kind='borderline2'),
        'ADASYN': lambda: ADASYN(bata=sample_ratio, k_neighbors=5,
                                 random_state=42),
        'Safe-level': lambda: SafeLevelSMOTE(N=sample_ratio, k_neighbors=5,
                                             random_state=42),
    }


def templet(sampler_name, sample_ratio):
    """Template method: cross-validate one sampler on the satimage dataset.

    For each of 10 stratified folds the features are min-max scaled (the
    scaler is fitted on the training fold only), the training fold is
    resampled with the chosen sampler, a decision tree is trained, and
    precision, recall, F1, AUC and G-mean on the test fold are recorded
    through ``fill_dic``. The elapsed time is printed at the end.

    :param sampler_name: name of the sampling algorithm; one of ``'CART'``,
        ``'SMOTE'``, ``'Border1'``, ``'Border2'``, ``'ADASYN'``,
        ``'Safe-level'``
    :param sample_ratio: sampling ratio forwarded to the sampler
    :return: None
    :raises ValueError: if *sampler_name* is not a supported name
    """
    factories = _templet_sampler_factories(sample_ratio)
    # Reject unknown names before any work is done; previously this left
    # the sampler as None and failed inside the loop with AttributeError.
    if sampler_name not in factories:
        raise ValueError('Unknown sampler name: %r (expected one of %s)'
                         % (sampler_name, ', '.join(sorted(factories))))
    make_sampler = factories[sampler_name]

    dataset = fetch_datasets()['satimage']
    X = dataset.data
    y = dataset.target

    # Start of the timed section.
    start_time = time.time()
    cv = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
    for train_idx, test_idx in cv.split(X, y):
        # Pre-processing: scale with statistics of the training fold only.
        scaler = preprocessing.MinMaxScaler().fit(X[train_idx])
        X_train_minmax = scaler.transform(X[train_idx])
        X_test_minmax = scaler.transform(X[test_idx])

        # Resample the training fold with a freshly built sampler.
        sb = make_sampler()
        X_res, y_res = sb.fit_sample(X_train_minmax, y[train_idx])

        model = tree.DecisionTreeClassifier(max_depth=8, min_samples_split=10,
                                            random_state=42)
        model.fit(X_res, y_res)
        predict = model.predict(X_test_minmax)
        probability = model.predict_proba(X_test_minmax)[:, 1]

        precision = metrics.precision_score(y[test_idx], predict)
        recall = metrics.recall_score(y[test_idx], predict)
        # Guard the harmonic mean against a zero denominator.
        if precision == 0:
            f1 = 0
        else:
            f1 = 2 * (precision * recall) / (precision + recall)
        auc = metrics.roc_auc_score(y[test_idx], probability)
        gmean = geometric_mean_score(y[test_idx], predict)

        # Record the fold's metrics.
        fill_dic('precision', sampler_name, sample_ratio, precision)
        fill_dic('recall', sampler_name, sample_ratio, recall)
        fill_dic('f1', sampler_name, sample_ratio, f1)
        fill_dic('auc', sampler_name, sample_ratio, auc)
        fill_dic('gmean', sampler_name, sample_ratio, gmean)

    print('%s %.1f building id transforming took %fs!'
          % (sampler_name, sample_ratio, time.time() - start_time))
def createSMOTEsamples(X, Y, nearestneigh, numNeighbors, majoritylabel,
                       minoritylabel):
    """Create synthetic samples and store them in ``synX`` / ``synY``.

    Every argument is passed straight through to
    ``SMOTE.createSyntheticSamples``; its two return values are written to
    the module-level globals ``synX`` and ``synY``. Returns nothing.
    """
    global synX, synY
    forwarded = (X, Y, nearestneigh, numNeighbors, majoritylabel,
                 minoritylabel)
    synX, synY = SMOTE.createSyntheticSamples(*forwarded)
def _cross_validate_cart(X, y, make_sampler=None):
    """Score a CART model with 10-fold stratified CV, optionally resampling.

    Returns a dict mapping 'precision', 'recall', 'f1', 'auc' and 'gmean'
    to the list of per-fold scores. When *make_sampler* is given, it is
    called once per fold and the sampler's ``fit_sample`` is applied to the
    scaled training fold before the tree is fitted.
    """
    scores = {'precision': [], 'recall': [], 'f1': [], 'auc': [], 'gmean': []}
    cv = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
    for train_idx, test_idx in cv.split(X, y):
        # Normalise with statistics of the training fold only.
        scaler = preprocessing.MinMaxScaler().fit(X[train_idx])
        X_train = scaler.transform(X[train_idx])
        X_test = scaler.transform(X[test_idx])
        y_train = y[train_idx]

        if make_sampler is not None:
            X_train, y_train = make_sampler().fit_sample(X_train, y_train)

        model = tree.DecisionTreeClassifier(max_depth=8, min_samples_split=10,
                                            random_state=42)
        model.fit(X_train, y_train)
        predict = model.predict(X_test)
        probability = model.predict_proba(X_test)[:, 1]

        precision = metrics.precision_score(y[test_idx], predict)
        recall = metrics.recall_score(y[test_idx], predict)
        # Guard the harmonic mean against a zero denominator.
        if precision == 0:
            f1 = 0
        else:
            f1 = 2 * (precision * recall) / (precision + recall)

        scores['precision'].append(precision)
        scores['recall'].append(recall)
        scores['f1'].append(f1)
        scores['auc'].append(metrics.roc_auc_score(y[test_idx], probability))
        scores['gmean'].append(geometric_mean_score(y[test_idx], predict))
    return scores


def test():
    """Compare plain CART with five over-samplers on the satimage dataset.

    Each configuration is evaluated with 10-fold stratified cross
    validation; the time taken by each configuration is printed, followed
    by a table of the mean precision, recall, AUC, F-measure and G-mean.
    """
    # (table row name, name used in the timing message, sampler factory)
    experiments = [
        ('CART', 'CART', None),
        ('SMOTE', 'SMOTE',
         lambda: SMOTE(N=100, k_neighbors=5, random_state=42)),
        ('Border1', 'BorderSmote1',
         lambda: BorderSMOTE(N=100, m_neighbors=30, k_neighbors=5,
                             random_state=42, kind='borderline1')),
        ('Border2', 'BorderSmote2',
         lambda: BorderSMOTE(N=100, m_neighbors=30, k_neighbors=5,
                             random_state=42, kind='borderline2')),
        ('ADASYN', 'ADASYN',
         lambda: ADASYN(bata=0.1, k_neighbors=5, random_state=42)),
        ('Safe-level', 'Safe-level',
         lambda: SafeLevelSMOTE(N=100, k_neighbors=5, random_state=42)),
    ]
    results = prettytable.PrettyTable(["Classifier", "Precision", 'Recall',
                                       'AUC', 'F-measure', 'G-mean'])

    # Load the data.
    dataset = fetch_datasets()['satimage']
    X = dataset.data
    y = dataset.target
    print(Counter(y))

    # Fixed random seed so every run of the experiment gives the same result.
    np.random.seed(42)

    for row_name, log_name, make_sampler in experiments:
        # The timer restarts for every configuration, so each message
        # reports that configuration's own duration.
        start_time = time.time()
        scores = _cross_validate_cart(X, y, make_sampler)
        print('%s building id transforming took %fs!'
              % (log_name, time.time() - start_time))
        results.add_row([row_name,
                         np.mean(np.array(scores['precision'])),
                         np.mean(np.array(scores['recall'])),
                         np.mean(np.array(scores['auc'])),
                         np.mean(np.array(scores['f1'])),
                         np.mean(np.array(scores['gmean']))])

    # Display the summary table.
    print(results)
def create_datasets(input_df, target_var, num_val_pos, num_val_neg,
                    num_train_pos, num_train_neg, num_smote=None,
                    num_train_sets=5, replace=True, SMOTE_random_sample=None):
    """Create validation frames and several train/test frames from *input_df*.

    Validation rows are drawn first, without replacement, and removed from
    the pool. Optionally, synthetic positive rows are generated from the
    remaining positives and appended. Training rows are then drawn from the
    pool with replacement, independently for every training set.

    Args:
        input_df: Starting dataframe.
        target_var: Name of the label column; rows equal to 1 are treated as
            positive and rows equal to 0 as negative.
        num_val_pos: Number of positive rows in the validation dataframe.
        num_val_neg: Number of negative rows in the validation dataframe.
        num_train_pos: Number of positive rows in each training dataframe,
            or ``'all'`` to put every non-validation positive row into each
            training set.
        num_train_neg: Number of negative rows in each training dataframe.
        num_smote: When not None and greater than 0, synthetic positive rows
            are generated; the value is forwarded to ``SMOTE().generate``.
        num_train_sets: Number of training sets to create.
        replace: Currently unused; kept so existing callers keep working.
        SMOTE_random_sample: When not None and greater than 0, the positives
            given to SMOTE are first reduced to a random sample of this size.

    Returns:
        ``(train_pos, train_neg, val_pos, val_neg)`` where ``train_pos`` and
        ``train_neg`` are dicts keyed ``'set_1'`` .. ``'set_<n>'`` holding
        dataframes of positive / negative training rows, and ``val_pos`` /
        ``val_neg`` are dataframes of positive / negative validation rows.
    """
    # Index labels of the positive and negative rows.
    positive_indices = input_df[input_df[target_var] == 1].index.tolist()
    negative_indices = input_df[input_df[target_var] == 0].index.tolist()

    # Create validation set
    print("Creating Validation sets...")
    start = time.time()
    val_positive_indices = np.random.choice(positive_indices, num_val_pos,
                                            replace=False)
    # The sampled values are index *labels*, so select with .loc; .iloc is
    # only equivalent when the frame carries a default RangeIndex.
    val_pos = input_df.loc[val_positive_indices, :].copy()
    val_negative_indices = np.random.choice(negative_indices, num_val_neg,
                                            replace=False)
    val_neg = input_df.loc[val_negative_indices, :].copy()

    # Remove validation rows from the train/test pool.
    positive_indices = list(
        set(positive_indices).difference(set(val_positive_indices)))
    negative_indices = list(
        set(negative_indices).difference(set(val_negative_indices)))
    end = time.time()
    print("Completed in {}s\n".format(round(end - start, 1)))

    # From here on the pool has a fresh 0..n-1 index, so labels and
    # positions coincide.
    remaining_indices = positive_indices + negative_indices
    remaining_df = input_df.loc[remaining_indices, :].reset_index(drop=True)

    if num_smote is not None and num_smote > 0:
        start = time.time()
        print("SMOTEing synthetic examples...")
        X_pos = remaining_df[remaining_df[target_var] == 1].drop(
            [target_var], axis=1)
        if SMOTE_random_sample is not None and SMOTE_random_sample > 0:
            X_pos = X_pos.sample(SMOTE_random_sample)
        smoter = SMOTE()
        X_synth = smoter.generate(
            X_pos,
            None,
            num_smote,
            False,
            custom_SMOTE.match_columns,
            custom_SMOTE.smote_columns,
        )
        y_synth = np.ones(X_synth.shape[0]).reshape(-1, 1)
        # Label the synthetic columns explicitly and reorder them to the
        # pool's layout, so the target column need not be the last one.
        # NOTE(review): assumes generate() returns its columns in the same
        # order as X_pos -- confirm against the SMOTE implementation.
        synths = pd.DataFrame(np.hstack((X_synth, y_synth)),
                              columns=list(X_pos.columns) + [target_var])
        synths = synths[remaining_df.columns]
        # ignore_index keeps the index unique; otherwise the synthetic rows
        # would share labels 0..m-1 with real rows and the positional
        # look-ups below would pick the wrong rows.
        new_df = pd.concat((remaining_df, synths), ignore_index=True)
        # infer_objects() replaces the removed DataFrame.convert_objects().
        new_df = new_df.infer_objects()
        end = time.time()
        print("Completed in {}s\n".format(round(end - start, 1)))
    else:
        new_df = remaining_df.copy()

    # Get indices of remaining samples
    positive_indices = new_df[new_df[target_var] == 1].index.tolist()
    negative_indices = new_df[new_df[target_var] == 0].index.tolist()

    # Create Train/Test Set Values
    print("Creating Train/Test sets...")
    start = time.time()
    if num_train_pos == 'all':
        # Every training set receives all remaining positives.
        train_positives = np.array(positive_indices)[:, np.newaxis].T
        train_positives = np.repeat(train_positives, num_train_sets, axis=0)
    else:
        train_positives = np.random.choice(
            positive_indices, size=(num_train_sets, num_train_pos))
    train_negatives = np.random.choice(
        negative_indices, size=(num_train_sets, num_train_neg))

    # Return Dataframes
    print("Returning Dataframes...")
    train_pos, train_neg = {}, {}
    for i in range(num_train_sets):
        set_name = "set_{}".format(i + 1)
        train_pos[set_name] = new_df.iloc[train_positives[i], :]
        train_neg[set_name] = new_df.iloc[train_negatives[i], :]
    end = time.time()
    print("Completed in {}s\n".format(round(end - start, 1)))

    print("Done")
    return train_pos, train_neg, val_pos, val_neg
def fit(self, X, y):
    """Train the boosted ensemble, over-sampling with SMOTE in every round.

    Each round resamples the training data according to the current
    instance weights, applies SMOTE, fits a clone of ``self.weak_estimator``
    and computes its weighted error (pseudo-loss) on the training data. A
    round whose loss exceeds 0.5 is retried; after too many retries
    boosting is aborted. Accepted models are appended to
    ``self.estimators_`` with weight ``log(1 / beta)`` in
    ``self.estimator_weights_`` and their loss in ``self.pseudo_loss``.

    Reads ``self.n_estimator``, ``self.class_dist``, ``self.rate``,
    ``self.random_state`` and ``self.weak_estimator``; sets
    ``self.majority_target`` and ``self.minority_target``.

    NOTE(review): ``self.pseudo_loss``, ``self.estimators_`` and
    ``self.estimator_weights_`` are appended to but not reset here, so they
    are presumably initialised elsewhere -- confirm that a second call to
    ``fit`` starts from empty lists.
    NOTE(review): X and y are indexed with boolean masks, so they are
    assumed to be NumPy arrays holding exactly two classes -- confirm.

    Args:
        X: Training samples.
        y: Training labels, aligned with *X*.
    """
    # Determine the majority and minority class labels.
    stats_c_ = Counter(y)
    self.majority_target = max(stats_c_, key=stats_c_.get)
    self.minority_target = min(stats_c_, key=stats_c_.get)

    total_number = len(X)  # Total number of instances in the training set
    pos_data = X[y == self.minority_target]
    neg_data = X[y == self.majority_target]
    pos_size = len(pos_data)  # number of positive (minority) instances
    neg_size = len(neg_data)  # number of negative (majority) instances

    # Reorganise the training data: all positives first, then all negatives.
    X_train = np.vstack([pos_data, neg_data])
    y_train = np.array([self.minority_target] * pos_size +
                       [self.majority_target] * neg_size)

    # weights[t] holds the instance weights used in boosting round t.
    weights = np.zeros(shape=[self.n_estimator, X.shape[0]])
    # Uniform start; 1.0 forces true division under Python 2 as well.
    weights[0] = 1.0 / X.shape[0]

    # Lower bound for the loss when deriving beta. A weak learner with zero
    # weighted error would otherwise give beta == 0, an infinite estimator
    # weight and, after normalisation, NaN sample weights.
    min_loss = 1e-10

    t = 0  # Loop counter
    count = 0  # Number of times the current boosting round has been repeated
    while t < self.n_estimator:
        if self.class_dist is True:
            # Resample each class separately, preserving the class sizes.
            pos_weights = weights[t][:pos_size]
            pos_weights = pos_weights / np.sum(pos_weights)
            resample_pos = pos_data[np.random.choice(
                a=pos_size, size=pos_size, replace=True, p=pos_weights)]

            neg_weights = weights[t][pos_size:total_number]
            neg_weights = neg_weights / np.sum(neg_weights)
            resample_neg = neg_data[np.random.choice(
                a=neg_size, size=neg_size, replace=True, p=neg_weights)]

            X_resampled = np.vstack([resample_pos, resample_neg])
            y_resampled = np.array([self.minority_target] * pos_size +
                                   [self.majority_target] * neg_size)
        else:
            # Resample the whole training set by weight.
            random_index = np.random.choice(
                a=total_number, size=total_number, replace=True, p=weights[t])
            X_resampled = X_train[random_index]
            y_resampled = y_train[random_index]

        # SMOTE step
        smote = SMOTE(N=self.rate, k_neighbors=5,
                      random_state=self.random_state)
        X_res, y_res = smote.fit_sample(X_resampled, y_resampled)

        # Train the weak classifier on the over-sampled data.
        model = clone(self.weak_estimator)
        model.fit(X_res, y_res)
        predict = model.predict(X_train)

        # Pseudo-loss of hypothesis 'model': weighted error rate.
        incorrect = predict != y_train
        loss = np.mean(np.average(incorrect, weights=weights[t], axis=0))

        # If count exceeds the threshold (5), boosting stops and the state
        # is rolled back to the rounds that were accepted.
        if count > 5:
            self.pseudo_loss = self.pseudo_loss[:t]
            self.estimator_weights_ = self.estimator_weights_[:t]
            self.estimators_ = self.estimators_[:t]
            print('Too many iterations have loss > 0.5')
            print('Aborting boosting')
            break

        if loss > 0.5:
            # Worse than chance: retry this round with a new resample.
            count = count + 1
            continue
        count = 1

        self.pseudo_loss.append(loss)  # Pseudo-loss at each iteration
        self.estimators_.append(model)  # Hypothesis function
        bounded_loss = max(loss, min_loss)
        beta = bounded_loss / (1 - bounded_loss)  # Weight update parameter
        self.estimator_weights_.append(np.log(1 / beta))  # Hypothesis weight

        # At the final iteration the weights need no further update.
        if t == self.n_estimator - 1:
            break

        # Shrink the weight of correctly classified instances by beta.
        correct = y_train == predict
        weights[t + 1][correct] = weights[t][correct] * beta
        weights[t + 1][~correct] = weights[t][~correct]

        # Normalise the weights for the next iteration.
        weights[t + 1] /= np.sum(weights[t + 1])

        t = t + 1