import pandas as pd
from sklearn.utils import shuffle

import smote  # project-local SMOTE implementation


def resample_train_data(train_data, n, frac):
    numeric_attrs = ['age', 'duration', 'campaign', 'pdays', 'previous',
                     'emp.var.rate', 'cons.price.idx', 'cons.conf.idx',
                     'euribor3m', 'nr.employed']
    # numeric_attrs = train_data.drop('y', axis=1).columns
    pos_train_data_original = train_data[train_data['y'] == 1]
    pos_train_data = train_data[train_data['y'] == 1]
    new_count = n * pos_train_data['y'].count()
    neg_train_data = train_data[train_data['y'] == 0].sample(frac=frac)
    train_list = []
    if n != 0:
        pos_train_X = pos_train_data[numeric_attrs]
        # Replicate the non-numeric columns n times so they can be paired
        # with the n * count synthetic numeric rows produced by SMOTE.
        pos_train_X2 = pd.concat([pos_train_data.drop(numeric_attrs, axis=1)] * n)
        pos_train_X2.index = range(new_count)
        s = smote.Smote(pos_train_X.values, N=n, k=3)
        pos_train_X = s.over_sampling()
        pos_train_X = pd.DataFrame(pos_train_X, columns=numeric_attrs,
                                   index=range(new_count))
        pos_train_data = pd.concat([pos_train_X, pos_train_X2], axis=1)
        pos_train_data = pd.DataFrame(pos_train_data,
                                      columns=pos_train_data_original.columns)
        train_list = [pos_train_data, neg_train_data, pos_train_data_original]
    else:
        train_list = [neg_train_data, pos_train_data_original]
    print("Size of positive train data: {} * {}".format(
        pos_train_data_original['y'].count(), n + 1))
    print("Size of negative train data: {} * {}".format(
        neg_train_data['y'].count(), frac))
    train_data = pd.concat(train_list, axis=0)
    return shuffle(train_data)
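
# Hypothetical usage of resample_train_data (the input file and parameter
# values below are assumptions for illustration, not from the project).
# With n=2 the positive class becomes the originals plus two synthetic
# copies (3x), while frac=0.5 keeps a random half of the negatives.
bank = pd.read_csv("bank-additional-full.csv", sep=";")  # assumed input file
bank["y"] = (bank["y"] == "yes").astype(int)             # assumed label encoding
balanced = resample_train_data(bank, n=2, frac=0.5)
print(balanced["y"].value_counts())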
import math

import cx_Oracle as cx  # assumed: `cx` is the cx_Oracle driver
import numpy as np
import pandas as pd

import smote  # project-local SMOTE implementation


# __init__ of a data-loading class, shown out of its class context.
def __init__(self, sql_code='select * from wepon_d1', smote_pro=True, smote_k=5):
    conn = cx.connect('coupon/coupon@pai_db')
    self.sql_code = sql_code
    df = pd.read_sql(sql_code, conn)
    X1 = df[x_columns]  # x_columns and target are module-level globals
    y1 = df[target]
    # (label, count) pairs for every class present in the data.
    s_label_count = [(i, y1[y1 == i].count()) for i in y1.unique()]
    y1 = np.asarray(y1)
    # Replace None/NaN with 0 and force every feature to float.
    X1 = X1.applymap(lambda x: 0 if x is None else x)
    X1 = X1.applymap(lambda x: 0 if np.isnan(x) else x)
    X1 = X1.applymap(lambda x: x * 1.0)
    datas = np.asanyarray(X1, np.float32)
    if smote_pro:
        # Oversample every minority class up to the size of the largest class.
        s_label_count.sort(key=lambda x: x[1], reverse=True)
        _, max_label_count = s_label_count[0]
        for s_label, s_count in s_label_count[1:]:
            s = smote.Smote(N=math.ceil(max_label_count / s_count), k=smote_k)
            smotedata = s.fit_transform(np.asarray(datas[y1 == s_label]))
            datas = np.vstack([datas, smotedata[:max_label_count - s_count]])
            add_y1 = np.zeros(max_label_count - s_count, dtype='int32')
            add_y1[:] = s_label
            y1 = np.hstack([y1, add_y1])
    self.datas = datas
    # Two-column one-hot labels: column 0 marks class 0, column 1 marks class 1.
    self.labels = np.zeros([len(y1), 2])
    self.labels[:, 0] = np.where(y1 == 0, 1, 0)
    self.labels[:, 1] = np.where(y1 == 1, 1, 0)
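
# Hypothetical instantiation of the data-loading class above (the class name
# `CouponData` is an assumption; the snippet only shows its __init__). It
# requires the Oracle DSN 'coupon/coupon@pai_db' to be reachable and the
# module-level x_columns / target to be defined.
ds = CouponData(sql_code='select * from wepon_d1', smote_pro=True, smote_k=5)
print(ds.datas.shape)         # feature matrix, minority classes oversampled
print(ds.labels.sum(axis=0))  # per-class counts; roughly equal after SMOTE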
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import smote  # project-local SMOTE implementation
# loaddata, preprocess, load_stopwords, fenci, feature_extraction,
# train_model and predict are project-local helpers.

if __name__ == '__main__':
    data = loaddata()
    data = preprocess(data)
    x_train, x_test, y_train, y_test = train_test_split(
        data['cus_comment'], data['target'], random_state=3, test_size=0.25)
    stopwords = load_stopwords()
    x_train_fenci = fenci(x_train)  # word segmentation
    tf = feature_extraction(x_train_fenci, stopwords)
    x_train_tf = tf.transform(x_train_fenci).toarray()
    # Collect the minority-class (label 0) samples for oversampling.
    samples0 = []
    for i, label in enumerate(y_train):
        if label == 0:
            samples0.append(x_train_tf[i])
    s = smote.Smote(np.array(samples0), N=600)
    over_samplings_x = s.over_sampling()
    total_samplings_x = np.vstack((x_train_tf, over_samplings_x))
    total_samplings_y = np.concatenate(
        (y_train, np.zeros(len(over_samplings_x))), axis=0)
    # model = train_model(x_train_tf, y_train, tf.transform(fenci(x_test)), y_test)
    model = train_model(total_samplings_x, total_samplings_y,
                        tf.transform(fenci(x_test)), y_test)
    y_predict = model.predict(tf.transform(fenci(x_test)))
    # Two held-out reviews in Chinese: comment1 is a glowing restaurant
    # review; comment2 complains the dessert was not worth 12 yuan a bowl.
    comment1 = "一如既往的好。已经快成了陆家嘴上班的我的食堂了。满减活动非常给力,上次叫了八样东西,折扣下来居然就六十左右,吃得好爽好爽。南瓜吃过几次,就一次不够酥烂,其他几次都很好。烤麸非常入味,适合上海人。鱼香肉丝有点辣,下饭刚好。那个蔬菜每次都点。总体很好吃。"
    comment2 = "糯米外皮不绵滑,豆沙馅粗躁,没有香甜味。12元一碗不值。"
    print(predict(model, pd.Series([comment1]), tf))
    print(predict(model, pd.Series([comment2]), tf))
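
# For reference, a minimal sketch of the Smote interface this script relies
# on: the constructor takes the minority-class samples plus an oversampling
# percentage N, and over_sampling() returns only the synthetic rows. This is
# an illustrative re-implementation following the original SMOTE paper
# (N=600 -> six synthetic rows per sample), not the project's actual module;
# note the other snippets in this section pass N as a plain multiplier, so
# their Smote variants differ.
import numpy as np
from sklearn.neighbors import NearestNeighbors


class SmoteSketch:
    """Illustrative SMOTE: interpolate toward random k-nearest neighbors."""

    def __init__(self, samples, N=100, k=5):
        self.samples = np.asarray(samples, dtype=float)
        self.N = N
        self.k = k

    def over_sampling(self):
        per_sample = self.N // 100  # synthetic rows generated per sample
        nn = NearestNeighbors(n_neighbors=self.k + 1).fit(self.samples)
        _, idx = nn.kneighbors(self.samples)  # idx[:, 0] is the point itself
        synthetic = []
        for i, x in enumerate(self.samples):
            for _ in range(per_sample):
                neighbor = self.samples[np.random.choice(idx[i, 1:])]
                gap = np.random.rand()  # random point on the segment x -> neighbor
                synthetic.append(x + gap * (neighbor - x))
        return np.array(synthetic)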
plt.title("Eigen Values of the Principal Components")
plt.xlim((0, 30))

# %%
PCA_data.columns = ["PC" + str(i) for i in range(1, 8)]
print("Shape of the Feature Matrix after PCA is:", PCA_data.shape)
print("PVE of the chosen PCs is:", pc_analyser.calc_PVE(m=7))
PCA_data = pd.concat([clean_Y, PCA_data], axis=1)

# %% [markdown]
# ### SMOTE

# %%
minority = PCA_data[PCA_data["Bankrupt?"] == 1]  # Extract minority samples from data
smt = smote.Smote(minority.to_numpy())           # Initialize the SMOTE class
oversamples = smt.oversample(N=2600)             # Employ SMOTE oversampling

# %%
smote_data = PCA_data.copy(deep=True)  # Cleared of outliers and dim-reduced by PCA; now oversample
oversamples_pd = pd.DataFrame(oversamples, columns=PCA_data.columns)
smote_data = pd.concat([smote_data, oversamples_pd])  # DataFrame.append was removed in pandas >= 2.0
smote_data = smote_data.reset_index(drop=True)

# %%
unstable_smote = (smote_data["Bankrupt?"] == 1).sum()
stable_smote = (smote_data["Bankrupt?"] == 0).sum()
print("Oversampled Data Size:", smote_data.shape[0])
print("Number of Stable Companies:", stable_smote)
print("Number of Unstable Companies (with SMOTE):", unstable_smote)
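
# %%
# A quick sanity check (illustrative addition, not from the original
# notebook): after appending the 2600 synthetic rows, the minority share
# should move much closer to 50% than in the raw data.
print("Minority share before SMOTE:",
      round((PCA_data["Bankrupt?"] == 1).mean(), 3))
print("Minority share after SMOTE:",
      round((smote_data["Bankrupt?"] == 1).mean(), 3))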