def get_feature_upsampling(
        csv_path="/home/liyulian/websafetyL/data/fraud/creditcard.csv"):
    """Load the credit-card CSV, split it 60/40 and report SMOTE class counts.

    The ``Amount`` column is standardised into ``normAmount``; ``Time`` and the
    raw ``Amount`` are dropped. Class counts are printed for the training split
    before and after SMOTE oversampling.

    Args:
        csv_path: Location of the CSV file. It must provide the ``Time``,
            ``Amount`` and ``Class`` columns used below. Defaults to the path
            that used to be hard-coded here.

    Returns:
        ``(x_train, x_test, y_train, y_test)`` exactly as produced by
        ``train_test_split``. The SMOTE output is only used for the printed
        report.
    """
    df = pd.read_csv(csv_path)
    df['normAmount'] = StandardScaler().fit_transform(
        df['Amount'].values.reshape(-1, 1))
    df = df.drop(['Time', 'Amount'], axis=1)

    y = df['Class']
    features = df.drop(['Class'], axis=1).columns
    x = df[features]
    # No random_state is passed, so the split differs from run to run.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4)

    print("raw data")
    print(pd.Series(y_train).value_counts())

    # Named ``smote`` (not ``os``) so the stdlib ``os`` module is not shadowed.
    # ``fit_resample`` replaces ``fit_sample``, which newer imbalanced-learn
    # releases no longer provide.
    smote = SMOTE(random_state=0)
    _, y_train_1 = smote.fit_resample(x_train, y_train)
    print("Smote data")
    print(pd.Series(y_train_1).value_counts())

    # NOTE(review): the oversampled training data is discarded and the raw
    # split is returned. Given the function name this looks unintended, but
    # the return value is left untouched for existing callers - confirm.
    return x_train, x_test, y_train, y_test
def get_feature_upsampling(csv_path="../data/fraud/creditcard.csv"):
    """Load the credit-card CSV, split it 60/40 and report SMOTE class counts.

    The ``Amount`` column is standardised into ``normAmount``; ``Time`` and the
    raw ``Amount`` are dropped. Class counts are printed for the training split
    before and after SMOTE oversampling.

    Args:
        csv_path: Location of the CSV file. It must provide the ``Time``,
            ``Amount`` and ``Class`` columns used below. Defaults to the
            relative path that used to be hard-coded here.

    Returns:
        ``(x_train, x_test, y_train, y_test)`` exactly as produced by
        ``train_test_split``. The SMOTE output is only used for the printed
        report.
    """
    df = pd.read_csv(csv_path)
    df['normAmount'] = StandardScaler().fit_transform(
        df['Amount'].values.reshape(-1, 1))
    df = df.drop(['Time', 'Amount'], axis=1)

    y = df['Class']
    features = df.drop(['Class'], axis=1).columns
    x = df[features]
    # No random_state is passed, so the split differs from run to run.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4)

    # Single-argument print(...) calls behave the same on Python 2 and 3;
    # the former print statements were a SyntaxError on Python 3.
    print("raw data")
    print(pd.Series(y_train).value_counts())

    # Named ``smote`` (not ``os``) so the stdlib ``os`` module is not shadowed.
    # ``fit_resample`` replaces ``fit_sample``, which newer imbalanced-learn
    # releases no longer provide.
    smote = SMOTE(random_state=0)
    _, y_train_1 = smote.fit_resample(x_train, y_train)
    print("Smote data")
    print(pd.Series(y_train_1).value_counts())

    # NOTE(review): the oversampled training data is discarded and the raw
    # split is returned. Given the function name this looks unintended, but
    # the return value is left untouched for existing callers - confirm.
    return x_train, x_test, y_train, y_test
#    a. With all features from RFE
#    b. With select features from RFE
# 2) Regular model

# ### Model 1a: Over-Sampling (All Features)

# In[41]:

# NOTE(review): ``os`` shadows the stdlib module of the same name. The name is
# kept because later cells (not visible here) may refer to it.
os = SMOTE(random_state=0)
Xceo_train, Xceo_test, yceo_train, yceo_test = train_test_split(
    Xceo, yceo, test_size=0.5, random_state=0)
columns = Xceo_train.columns

# Only the training half is oversampled. ``fit_resample`` replaces
# ``fit_sample``, which newer imbalanced-learn releases no longer provide.
os_ceo_X, os_ceo_y = os.fit_resample(Xceo_train, yceo_train)
os_ceo_X = pd.DataFrame(data=os_ceo_X, columns=columns)
# Newer imbalanced-learn returns pandas objects for pandas input. Passing the
# raw values fills the column positionally; a named Series would be matched
# against 'label' by name and could produce an all-NaN column.
os_ceo_y = pd.DataFrame(data=getattr(os_ceo_y, "values", os_ceo_y),
                        columns=['label'])

# In[42]:

print("length of oversampled ceos is ", len(os_ceo_X))
print("Number of non-CEOs in oversampled ceos",
      len(os_ceo_y[os_ceo_y['label'] == 0]))
print("Number of CEOs", len(os_ceo_y[os_ceo_y['label'] == 1]))
print("Proportion of non-ceos in oversampled ceos is ",
      len(os_ceo_y[os_ceo_y['label'] == 0]) / len(os_ceo_X))
print("Proportion of ceos in oversampled ceos is ",
      len(os_ceo_y[os_ceo_y['label'] == 1]) / len(os_ceo_X))

# In[43]:
# Class balance of the raw split (figures recorded by the original author).
np.sum(y_train == True)  # 16391
np.sum(y_train == False)  # 1504449
np.sum(y_test == True)  # 7490
np.sum(y_test == False)  # 689628
train_col_names = X_train.columns

# over-sampling using SMOTE-Synthetic Minority Oversampling Technique
from imblearn.over_sampling import SMOTE
# NOTE(review): ``os`` shadows the stdlib module of the same name. The name is
# kept because code outside this fragment may refer to it.
os = SMOTE(random_state=0)
# ``fit_resample`` replaces ``fit_sample``, which newer imbalanced-learn
# releases no longer provide.
os_data_X, os_data_y = os.fit_resample(X_train, y_train)
os_data_X = pd.DataFrame(data=os_data_X, columns=train_col_names)
# Flatten to a plain array first: a named Series would be matched against 'y'
# by name and could leave the column all-NaN.
os_data_y = pd.DataFrame(data=np.asarray(os_data_y).ravel(), columns=['y'])

# check the lengths of data now
os_data_X.shape  # (2996702, 55)
len(os_data_y)  # 2996702

# percent of True
n_total = len(os_data_y)
# Vectorised count; the builtin sum() iterated millions of rows in Python.
n_true = (os_data_y['y'] == True).sum()
n_true  # 1498351 (before oversampling: 23881)
n_false = (os_data_y['y'] == False).sum()
def __init__(self, getfile, test_num):
    """Train the hybrid logistic-regression model and publish its results.

    Steps, in order: read ``getfile``, split and scale the features,
    oversample the training split, fit ``LogisticRegression``, store the
    predictions and the CSV rows in MySQL, show the confusion matrix, and
    finally expose the metrics through module-level globals.

    Args:
        getfile: Path of the CSV file. Columns 5 and 6 are used as features
            and column 7 as the label; the first eight fields of every row
            are also copied into the ``baseline`` table.
        test_num: Passed to ``train_test_split`` as ``test_size``.

    Side effects:
        Reads ``temp_file.csv`` from the working directory, rewrites the
        ``hybrid_logitval`` and ``baseline`` tables, opens a plot window and
        sets the globals ``accurate``, ``confuse``, ``posi``, ``neut``,
        ``nega``, ``overall``, ``plots``, ``replot``, ``percentage`` and
        ``reports``.
    """
    global accurate, confuse, posi, neut, nega, overall, plots, replot, percentage, reports

    #-----SPLIT DATASETS-------
    self.getfile = getfile
    self.test_num = test_num
    tested = pd.read_csv(self.getfile)
    x = tested.iloc[:, [5, 6]].values
    # output
    y = tested.iloc[:, 7].values
    xtrain, xtest, ytrain, ytest = train_test_split(
        x, y, test_size=self.test_num, random_state=42)
    print(xtrain.shape, xtest.shape, ytrain.shape, ytest.shape)

    #4 Feature Scaling
    # The scaler is fitted on the training split only and then applied to the
    # test split, so both live in the same feature space.
    sc_x = StandardScaler()
    xtrain = sc_x.fit_transform(np.asarray(xtrain))
    xtest = sc_x.transform(np.asarray(xtest))

    #---------------SMOTE ALGORITHM--------------------------
    print("Before OverSampling, counts of label '1': {}\n".format(
        sum(ytrain == 1)))
    print("Before OverSampling, counts of label '-1': {} \n".format(
        sum(ytrain == -1)))
    print('WITH SMOTE')
    # Oversample the *scaled training split*. The previous code resampled the
    # full, unscaled (x, y): test rows leaked into training and the model was
    # fitted on unscaled features while predicting on scaled ones.
    # ``fit_resample`` is used for both samplers; ``fit_sample`` no longer
    # exists in newer imbalanced-learn releases.
    random_sampler = RandomOverSampler(sampling_strategy='minority')
    xtrain_res, ytrain_res = random_sampler.fit_resample(xtrain, ytrain)
    oversample = SMOTE()
    xtrain, ytrain = oversample.fit_resample(xtrain_res, ytrain_res.ravel())
    counter = Counter(ytrain)
    print(counter)
    print('After OverSampling, the shape of train_X: {}'.format(
        xtrain.shape))
    print('After OverSampling, the shape of train_y: {} \n'.format(
        ytrain.shape))
    print("After OverSampling, counts of label '1': {}".format(
        sum(ytrain == 1)))
    print("After OverSampling, counts of label '-1': {}".format(
        sum(ytrain == -1)))

    #---------------LOGISTIC REGRESSION----------------------
    #5 Fitting the Logistic Regression to the Training Set
    classifier = LogisticRegression()
    classifier.fit(xtrain, ytrain)

    #6 Predicting the Test set results
    y_pred = classifier.predict(xtest)

    # NOTE(review): the tallies start at 1, so each ends up one higher than
    # the number of matching predictions - confirm this is intended.
    posed = 1
    neued = 1
    neged = 1

    import MySQLdb
    mydb = MySQLdb.connect(host="127.0.0.1",
                           user="******",
                           password="",
                           database="logitregression_data")
    try:
        mycursor = mydb.cursor()

        with open('temp_file.csv', 'r') as tempo:
            logit = list(csv.reader(tempo, delimiter=','))

        with open(getfile, 'r') as file:
            reader = csv.reader(file, delimiter=',')
            counter = 0
            mycursor.execute("DELETE FROM hybrid_logitval")
            query2 = "INSERT INTO `hybrid_logitval`(`HYB_ID`, `HYB_VALUE`, `HYB_SENTIMENT`, `HYB_RESULT`) VALUES (%s,%s,%s,%s)"

            #-----Classify every prediction and store it-----
            for over in y_pred:
                counter += 1
                if over == 1:
                    posed += 1
                    resu = 'Positive'
                    regval = 1
                elif over == 0:
                    neued += 1
                    resu = 'Neutral'
                    regval = 0
                else:
                    neged += 1
                    resu = 'Negative'
                    regval = -1
                # NOTE(review): ``counter`` is already incremented, so row 0
                # of temp_file.csv is never used, and ``logit[counter]`` is a
                # whole CSV row (a list) - confirm both are intended.
                mycursor.execute(query2,
                                 (counter, logit[counter], regval, resu))

            # Copy the first eight fields of every CSV row into `baseline`.
            all_value = [
                (row[0], row[1], row[2], row[3],
                 row[4], row[5], row[6], row[7])
                for row in reader
            ]
            mycursor.execute("DELETE FROM `baseline`")
            query = "INSERT INTO `baseline`(`ID`, `TWEETS`, `TOKENIZED`, `STOP_WORDS`, `STEMMED`, `POLARITY`, `SUBJECTIVITY`, `SENTIMENT`) VALUES (%s,%s,%s,%s,%s,%s,%s,%s)"
            mycursor.executemany(query, all_value)
            # presumably removes the CSV header row, stored with ID 0 - verify
            mycursor.execute("DELETE FROM `baseline` WHERE `baseline`.`ID` = 0")
        mydb.commit()
    finally:
        # Release the connection even when a query or file read fails.
        mydb.close()

    #---------------CONFUSION MATRIX----------------------
    #7 Making the Confusion Matrix. It contains the correct and incorrect
    # predictions of our model: ytest holds the true labels, y_pred the
    # logistic regression predictions.
    cm = confusion_matrix(ytest, y_pred)
    import warnings
    warnings.filterwarnings("ignore")
    cr = classification_report(ytest, y_pred)
    print(ytest)
    print("Confusion Matrix : \n", cm)
    print(cr)

    from mlxtend.plotting import plot_confusion_matrix
    class_names = ['-1', '0', '1']
    fig, ax = plot_confusion_matrix(conf_mat=cm,
                                    colorbar=True,
                                    class_names=class_names)
    fig.canvas.set_window_title('HYBRID LOGISTIC REGRESSION')
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    plt.show()

    #-------SENDS ALL VALUES TO APPEAR ON THE USER INTERFACE----------------
    accurate = accuracy_score(ytest, y_pred)
    print(accurate)
    percentage = "{:.0%}".format(accurate)
    confuse = cm
    print(percentage)
    posi = posed
    neut = neued
    nega = neged
    plots = y_pred
    replot = plt
    reports = cr

    # Ties resolve in the order NEUTRAL, POSITIVE, NEGATIVE.
    if (neut >= posi) and (neut >= nega):
        overall = 'NEUTRAL'
    elif (posi >= neut) and (posi >= nega):
        overall = 'POSITIVE'
    else:
        overall = 'NEGATIVE'
    print(overall)
#1520840
# Class balance of the raw split (figures recorded by the original author).
np.sum(y_train == True)  # 16391
np.sum(y_train == False)  # 1504449
np.sum(y_test == True)  # 7490
np.sum(y_test == False)  # 689628
train_col_names = X_train.columns

# over-sampling using SMOTE-Synthetic Minority Oversampling Technique
from imblearn.over_sampling import SMOTE
# NOTE(review): ``os`` shadows the stdlib module of the same name. The name is
# kept because code outside this fragment may refer to it.
os = SMOTE(random_state=0)
# ``fit_resample`` replaces ``fit_sample``, which newer imbalanced-learn
# releases no longer provide.
os_data_X, os_data_y = os.fit_resample(X_train, y_train)
os_data_X = pd.DataFrame(data=os_data_X, columns=train_col_names)
# Flatten to a plain array first: a named Series would be matched against 'y'
# by name and could leave the column all-NaN.
os_data_y = pd.DataFrame(data=np.asarray(os_data_y).ravel(), columns=['y'])

# check the lengths of data now
os_data_X.shape  # (2996702, 55)
len(os_data_y)  # 2996702

# percent of True
n_total = len(os_data_y)
# Vectorised count; the builtin sum() iterated millions of rows in Python.
n_true = (os_data_y['y'] == True).sum()
n_true  # 1498351 (before oversampling: 23881)
n_false = (os_data_y['y'] == False).sum()
# Swap the raw numeric columns for their scaled counterparts (joined on index).
telcom = telcom.drop(columns=num_cols, axis=1)
telcom = telcom.merge(scaled, left_index=True, right_index=True, how="left")

from imblearn.over_sampling import SMOTE

# Feature columns: everything except the id and target columns.
cols = [i for i in telcom.columns if i not in Id_col + target_col]

smote_X = telcom[cols]
smote_Y = telcom[target_col]

# Split train and test data
smote_train_X, smote_test_X, smote_train_Y, smote_test_Y = train_test_split(
    smote_X, smote_Y, test_size=.25, random_state=111)

# Oversample the minority class with SMOTE (training split only).
# ``fit_resample`` replaces ``fit_sample``, which newer imbalanced-learn
# releases no longer provide.
# NOTE(review): ``os`` shadows the stdlib module of the same name. The name is
# kept because code outside this fragment may refer to it.
os = SMOTE(random_state=0)
os_smote_X, os_smote_Y = os.fit_resample(smote_train_X, smote_train_Y)
os_smote_X = pd.DataFrame(data=os_smote_X, columns=cols)
os_smote_Y = pd.DataFrame(data=os_smote_Y, columns=target_col)

# Split the full frame with the same test_size and random_state as above.
train, test = train_test_split(telcom, test_size=.25, random_state=111)

## separating dependent and independent variables
cols = [i for i in telcom.columns if i not in Id_col + target_col]
train_X = train[cols]
train_Y = train[target_col]
test_X = test[cols]
test_Y = test[target_col]

# # 3. Common function for model prediction
# In[19]:

X = X[columns1]
#y = y[target]
print(X.shape)
print(y.shape)

# In[20]:

# Toggle for random oversampling of the minority class.
OVERSAMPLING = True
if OVERSAMPLING:
    # NOTE(review): ``os`` shadows the stdlib module of the same name. The
    # name is kept because later cells may refer to it.
    os = RandomOverSampler()
    # ``fit_resample`` replaces ``fit_sample``, which newer imbalanced-learn
    # releases no longer provide.
    X_res, y_res = os.fit_resample(X, y)
else:
    X_res = X
    y_res = y

# In[21]:

# split into train and validation data
# NOTE(review): oversampling happens before the split, so duplicated rows can
# land in both the training and the validation set - confirm this is intended.
from sklearn.model_selection import train_test_split
X_train, X_val, Y_train, Y_val = train_test_split(
    X_res, y_res, test_size=0.2, random_state=0, stratify=y_res)
print(X_train.shape, X_val.shape)
# If oversampling works, these should both print 0.5
print(np.average(Y_train))
# NOTE(review): ``sklearn.grid_search`` and ``cross_validation`` only exist in
# old scikit-learn releases, and ``ratio=`` / ``fit_sample`` are the old
# imbalanced-learn spellings. The code is left untouched because the target
# library versions cannot be determined from this fragment.
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV
from imblearn.over_sampling import RandomOverSampler

# Rows are pre-tagged 'train' / 'test' in the ``TT`` column; 'grant_amount'
# is the prediction target.
train_data = data[data['TT']=='train']
x = train_data.drop(['TT','student_id','grant_amount','class_rank'],axis = 1)
y = train_data['grant_amount']
test_data = data[data['TT']=='test']
test_x = test_data.drop(['TT','student_id','grant_amount','class_rank'],axis = 1)
test_y = test_data['grant_amount']

# Randomly oversample the 'train' rows, then split the oversampled data again.
# NOTE(review): ``os`` shadows the stdlib module of the same name.
os = RandomOverSampler(ratio=1.0)
X_overs, y_overs = os.fit_sample(x, y)
data_train, data_test, target_train, target_test = cross_validation.train_test_split(X_overs, y_overs)

# The remaining keyword arguments of this call lie outside this fragment.
xgb1 = XGBClassifier(
 learning_rate=0.02,
 n_estimators=820,
 max_depth=3,
 min_child_weight=0.5,
 gamma=0.01,
 subsample=0.7,
 colsample_bytree=0.7,
 colsample_bylevel=0.6,
 objective='multi:softmax',
 seed=10,
 nthread=8,
# Our dataset is strongly imbalanced (~4000 accepted vs 57000 not-accepted) # Use oversampling to generate pseudodata # Take 20% of dataset as test-set, use the rest for training X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=1) columns = X_train.columns X_test_orig = X_test y_test_orig = y_test # Create oversampled dataset os = SMOTE(random_state=1) os_data_X_train, os_data_y_train = os.fit_sample(X_train, y_train.values.ravel()) os_data_X_train = pd.DataFrame(data=os_data_X_train, columns=columns) os_data_y_train = pd.DataFrame(data=os_data_y_train, columns=['request_status']) os_data_X_test, os_data_y_test = os.fit_sample(X_test, y_test.values.ravel()) os_data_X_test = pd.DataFrame(data=os_data_X_test, columns=columns) os_data_y_test = pd.DataFrame(data=os_data_y_test, columns=['request_status']) # Plausibility check for oversampling print( "\nApply oversampling to get equal ratio of acceptance/non-acceptance:\n") print("New length of our oversampled dataset is ", len(os_data_X_train)) print("Number of non-acceptance in oversampled dataset", len(os_data_y_train[os_data_y_train['request_status'] == 0])) print("Number of acceptance in oversampled dataset",
# Threshold ``b`` at 0.5 to label the result.
if b > 0.5:
    print('Churn')
else:
    print('No Churn')

from sklearn.metrics import confusion_matrix , classification_report
print(classification_report(y_test, y_pred))

from imblearn.combine import SMOTETomek
from imblearn.under_sampling import NearMiss
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

# Rebalance the training split with SMOTETomek.
# ``fit_resample`` replaces ``fit_sample``, which newer imbalanced-learn
# releases no longer provide.
# NOTE(review): ``os`` shadows the stdlib module of the same name. The name is
# kept because code outside this fragment may refer to it.
os = SMOTETomek()
X_train_ns, y_train_ns = os.fit_resample(X_train, y_train)
print("The number of classes before fit {}".format(Counter(y_train)))
print("The number of classes after fit {}".format(Counter(y_train_ns)))

# Feed-forward binary classifier with 19 inputs and one sigmoid output.
model1 = Sequential([
    Dense(19, input_shape=(19,), activation='relu'),
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid')
])
model1.compile(optimizer='adam',
               loss='binary_crossentropy',
               metrics=['accuracy'])
# Train on the resampled data, validate on the untouched test split.
rs = model1.fit(X_train_ns, y_train_ns, epochs=200,
                validation_data=(X_test, y_test))
random_state=0) ## Scaling(seperate for train and test set) scaler = preprocessing.StandardScaler() scaled_values_train = scaler.fit_transform(X_train) scaled_values_test = scaler.fit_transform(X_test) X_scaled = scaled_values_train columns = X.columns X_test_scaled = pd.DataFrame(data=scaled_values_test, columns=columns) ##Over-sampling training data using SMOTE os = SMOTE(random_state=0) columns = X.columns os_X_train, os_y_train = os.fit_sample(X_scaled, y_train) os_data_X = pd.DataFrame(data=os_X_train, columns=columns) os_data_y = pd.DataFrame(data=os_y_train, columns=['y']) # Oversampling report print("\nBalancing data with synthetic data..") print("\nLength of synthetic training data:", (len(os_data_X) - len(X_train))) print("Length of original training data:", len(X_train)) print("Length of oversampled training data:", len(os_data_X)) print("Proportion of negative examples in original data:", round(len(y_train[y_train == 0]) / len(y_train), 2)) print("Proportion of negative examples in oversampled data:", len(os_data_y[os_data_y['y'] == 0]) / len(os_data_X)) # Set the parameters by cross-validation