def test_sample_wrong_X():
    """Test that an error is raised when X differs between fitting and sampling."""
    # Create the object
    ros = RandomOverSampler(random_state=RND_SEED)
    ros.fit(X, Y)
    # Sampling with a differently-shaped X must raise a RuntimeError
    assert_raises(RuntimeError, ros.sample,
                  np.random.random((100, 40)),
                  np.array([0] * 50 + [1] * 50))
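# The two-step fit()/sample() API exercised above is the legacy imbalanced-learn
# interface (deprecated in 0.4, removed in 0.6). A minimal sketch of the modern
# one-step equivalent, using an illustrative toy dataset rather than the
# module-level X/Y fixtures assumed above:
import numpy as np
from imblearn.over_sampling import RandomOverSampler

X_toy = np.random.random((10, 2))
y_toy = np.array([0] * 3 + [1] * 7)
ros = RandomOverSampler(random_state=0)
# fit_resample validates and resamples in one call; there is no separate
# sample() step that could receive a mismatched X
X_res, y_res = ros.fit_resample(X_toy, y_toy)
assert len(y_res) == 14  # the minority class (3 samples) is duplicated up to 7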
def oversample_data():
    # y is taken as a 1-D Series to avoid a column-vector warning
    X, y = encoded_data.drop('Churn', axis=1), encoded_data['Churn']
    oversampler = RandomOverSampler(random_state=1)
    # fit_resample fits and resamples in one step; a separate fit() is redundant
    X_oversampled, y_oversampled = oversampler.fit_resample(X, y)
    return X_oversampled, y_oversampled
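# A hedged usage sketch for the helper above; ``encoded_data`` is assumed to be
# a DataFrame with a binary 'Churn' column defined elsewhere:
X_bal, y_bal = oversample_data()
print(y_bal.value_counts())  # both classes now carry the majority-class count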
def test_ros_fit():
    """Test the fitting method."""
    # Create the object
    ros = RandomOverSampler(random_state=RND_SEED)
    # Fit the data
    ros.fit(X, Y)
    # Check that the class statistics have been computed
    assert_equal(ros.min_c_, 0)
    assert_equal(ros.maj_c_, 1)
    assert_equal(ros.stats_c_[0], 500)
    assert_equal(ros.stats_c_[1], 4500)
def test_ros_fit():
    """Test the fitting method."""
    # Create the object
    ros = RandomOverSampler(random_state=RND_SEED)
    # Fit the data
    ros.fit(X, Y)
    # Check that the class statistics have been computed
    assert_equal(ros.min_c_, 0)
    assert_equal(ros.maj_c_, 1)
    assert_equal(ros.stats_c_[0], 3)
    assert_equal(ros.stats_c_[1], 7)
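# The min_c_/maj_c_/stats_c_ attributes checked in the two tests above were
# removed from imbalanced-learn; the closest modern equivalent is
# sampling_strategy_, a dict of how many samples each class will gain.
# A hedged sketch with an illustrative toy dataset:
import numpy as np
from imblearn.over_sampling import RandomOverSampler

X_toy = np.arange(20).reshape(10, 2)
y_toy = np.array([0] * 3 + [1] * 7)
ros = RandomOverSampler(random_state=0)
ros.fit_resample(X_toy, y_toy)
# e.g. {0: 4}: generate 4 extra samples of class 0 to match class 1's 7
print(ros.sampling_strategy_)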
def selection(**kwargs):
    df = kwargs['ti'].xcom_pull(task_ids='data_preprocessing', key='df')

    # Label-encode the categorical (object) columns
    for col in df.columns:
        if df[col].dtypes == 'object':
            encoder = LabelEncoder()
            df[col] = encoder.fit_transform(df[col])

    X = df.drop('income', axis=1)
    Y = df['income']

    # Rank features with an extra-trees classifier
    selector = ExtraTreesClassifier(random_state=42)
    selector.fit(X, Y)
    feature_imp = selector.feature_importances_
    for index, val in enumerate(feature_imp):
        print(index, round((val * 100), 2))

    # Drop the low-importance features
    X = X.drop([
        'workclass', 'education', 'race', 'gender', 'capital-loss',
        'native-country'
    ], axis=1)

    # Standardize each remaining column
    for col in X.columns:
        scaler = StandardScaler()
        X[col] = scaler.fit_transform(X[col].values.reshape(-1, 1))

    # Class balance before and after oversampling
    print(round(Y.value_counts(normalize=True) * 100, 2).astype('str') + ' %')
    ros = RandomOverSampler(random_state=42)
    # fit_resample fits and resamples in one step; a separate fit() is redundant
    X_resampled, Y_resampled = ros.fit_resample(X, Y)
    print(
        round(Y_resampled.value_counts(normalize=True) * 100, 2).astype('str')
        + ' %')

    X_train, X_test, Y_train, Y_test = train_test_split(X_resampled,
                                                        Y_resampled,
                                                        test_size=0.2,
                                                        random_state=42)
    print("X_train shape:", X_train.shape)
    print("X_test shape:", X_test.shape)
    print("Y_train shape:", Y_train.shape)
    print("Y_test shape:", Y_test.shape)

    kwargs['ti'].xcom_push(key='X_train', value=X_train)
    kwargs['ti'].xcom_push(key='X_test', value=X_test)
    kwargs['ti'].xcom_push(key='Y_train', value=Y_train)
    kwargs['ti'].xcom_push(key='Y_test', value=Y_test)
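# A hedged sketch of wiring the task above into a DAG (task id and the ``dag``
# object are illustrative, not from the original). In Airflow 1.x the context
# kwargs require provide_context=True; Airflow 2.x passes them automatically
# and imports the operator from airflow.operators.python instead:
from airflow.operators.python_operator import PythonOperator

selection_task = PythonOperator(
    task_id='feature_selection',
    python_callable=selection,
    provide_context=True,
    dag=dag,  # assumes a DAG object defined elsewhere
)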
def __init__(self, data_pool, parameters, training):
    self.data_pool = data_pool
    self.parameters = parameters
    self.batch_size = parameters['batch_size']
    self.training = training
    # ``training`` is a boolean flag: whether the data is for training or test.
    # During training, the data is sampled from a pool.
    # During test, the data is sampled sequentially and exhaustively.
    # A vector needs to be given of whether the data is padding data at the end
    # of the dataset, and a return state of whether all test data has been given.
    self.categorical = True
    self.d_thresh_range = None
    self.val_minibatch_idx = 0
    self.d_thresh = None
    self.reduced_pool = None
    self.distance_pool_cache = {}
    self.input_mask = pd.Series([
        np.tile(self.parameters['input_mask'],
                (self.parameters['observation_steps'], 1))
        for x in range(self.batch_size)
    ], dtype=object, index=([0] * self.batch_size))

    # Generate a class-balanced index list by oversampling the row indices
    ros = RandomOverSampler()
    if 'relative' in self.parameters['ibeo_data_columns'][0]:
        selection_data = list(data_pool.relative_destination.values)
    else:
        selection_data = list(data_pool.track_class.values)
    le = preprocessing.LabelEncoder()
    indexed_classes = np.array(le.fit_transform(selection_data))
    # Resample the indices themselves (as a single-column 2-D array) so the
    # balanced dataset can later be gathered from the pool by index
    indices = np.arange(len(indexed_classes)).reshape(-1, 1)
    balanced_idxs, _ = ros.fit_resample(indices, indexed_classes)
    self.balanced_idxs = np.squeeze(balanced_idxs)
def execute(trainfile, sampler):
    print("--- Executing")
    print("Using trainfile: ", trainfile)
    print("--- Loading (transformed) data")
    data = Data.Data()
    train_df = data.load(trainfile)
    y = train_df["is_attributed"]
    X = train_df.drop(["is_attributed"], axis=1)
    columns = X.columns.values

    before_class_weight = dict(
        zip([0, 1], compute_class_weight('balanced', classes=[0, 1], y=y)))
    print("Original weights: ", before_class_weight)

    if sampler == "RANDOM":
        oversampler = RandomOverSampler(random_state=0)
    elif sampler == "ADASYN":
        oversampler = ADASYN(random_state=0)
    elif sampler == "SMOTE":
        oversampler = SMOTE(random_state=0)
    else:
        raise ValueError("Invalid sampler: " + sampler)
    # fit_resample replaces the deprecated fit()/sample() two-step API
    X_resampled, y_resampled = oversampler.fit_resample(X, y)

    after_class_weight = dict(
        zip([0, 1],
            compute_class_weight('balanced', classes=[0, 1], y=y_resampled)))
    print("Sampler: ", sampler, ", weights: ", after_class_weight)

    X_resampled = X_resampled.astype(int)
    y_resampled = y_resampled.astype(int)
    df = pd.DataFrame(data=X_resampled, columns=columns)
    df["is_attributed"] = y_resampled

    compressor = "blosc"
    outfilename = trainfile + "." + sampler
    print("Output file (over-sampled): ", outfilename)
    df.to_hdf(outfilename, "table", mode="w", append=True,
              complevel=9, complib=compressor)
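# A hedged usage sketch for the function above; the HDF5 filename is
# illustrative, not from the original:
execute("train_transformed.h5", "SMOTE")  # writes train_transformed.h5.SMOTE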
import pickle
import os

import pandas as pd
from imblearn.over_sampling import RandomOverSampler

with open(os.path.abspath('data') + '/google_play_review.pickle', 'rb') as f:
    df = pickle.load(f)

ros = RandomOverSampler(random_state=666)
# fit_resample fits and resamples in one step; a separate fit() is redundant.
# y is passed as a 1-D Series to avoid a column-vector warning.
X, y = ros.fit_resample(df[['reviews', 'replies']], df['ratings'])
df = pd.DataFrame(X, columns=['reviews', 'replies'])
train_size = int(len(df) * 0.9)

with open(os.path.abspath('data') + '/train.txt', 'w') as f:
    for index, row in df[:train_size].iterrows():
        if isinstance(row['reviews'], float):
            continue
        f.write(row['reviews'].replace('\r\n', ' '))
        f.write('\n')
        f.write(row['replies'].replace('\r\n', ' '))
        f.write('\n')

with open(os.path.abspath('data') + '/train.reviews.txt', 'w') as f:
    for index, row in df[:train_size].iterrows():
        if isinstance(row['reviews'], float):
            continue
        f.write(row['reviews'].replace('\r\n', ' '))
y = train.target
train.drop(["target", "id"], axis=1, inplace=True)
test.drop("id", axis=1, inplace=True)

# Train / test / validation split. The validation set is carved out of the
# training portion so it stays disjoint from the test set (re-splitting the
# full data, as before, would leak test rows into training).
x_train, x_test, y_train, y_test = train_test_split(train, y, test_size=0.1)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train,
                                                  test_size=0.3)
print("TRAIN : ", x_train.shape, " and ", y_train.shape)
print("TEST : ", x_test.shape, " and ", y_test.shape)
print("VALIDATION : ", x_val.shape, " and ", y_val.shape)
print("MAIN TO PREDICT ", test.shape)

# Random oversampling of the training fold only
ros = RandomOverSampler(random_state=0)
X_resampledo, y_resampledo = ros.fit_resample(x_train, y_train)
print(X_resampledo.shape, y_resampledo.shape)

# Model selection
catboost_pool = Pool(X_resampledo, y_resampledo)
cat_model = CatBoostClassifier(task_type='CPU',
                               iterations=20000,
                               learning_rate=0.03,
                               early_stopping_rounds=5)
cat_model.fit(X_resampledo, y_resampledo,
              verbose=True, plot=False,
              eval_set=(x_val, y_val))

# Accuracy on the held-out test set
print(cat_model.score(x_test, y_test))

# Metrics and score
y_pred = cat_model.predict(x_test)
print("ACCURACY SCORE : ", accuracy_score(y_test, y_pred))
print("MAE : ", mean_absolute_error(y_test, y_pred))
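# Oversampling only the training fold (as above) can also be enforced with
# imbalanced-learn's Pipeline, which applies the sampler during fit() but never
# during predict()/score(). A minimal sketch with an illustrative
# logistic-regression model standing in for CatBoost:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression

clf = Pipeline([
    ('oversample', RandomOverSampler(random_state=0)),
    ('model', LogisticRegression(max_iter=1000)),
])
clf.fit(x_train, y_train)         # resamples, then fits the model
print(clf.score(x_test, y_test))  # test data is scored unresampled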
import pickle

X = X.drop([
    'workclass', 'education', 'race', 'sex', 'capital.loss',
    'native.country', 'fnlwgt', 'relationship', 'capital.gain'
], axis=1)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)

from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=42)
# fit_resample fits and resamples in one step; a separate fit() is redundant
X, Y = ros.fit_resample(X, Y)

from sklearn.ensemble import RandomForestClassifier
ran_for = RandomForestClassifier(max_depth=102, n_estimators=40,
                                 random_state=42)
ran_for.fit(X, Y)

with open('model.pkl', 'wb') as f:
    pickle.dump(ran_for, f)
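# A hedged usage sketch for the artifact above. The StandardScaler is fitted
# but not persisted, so raw inputs cannot be scored faithfully at inference
# time; persisting it alongside the model (filename illustrative) is one fix:
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

with open('model.pkl', 'rb') as f:
    model = pickle.load(f)
# New rows must be scaled with the same fitted scaler before prediction:
# preds = model.predict(scaler.transform(new_rows))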
def main(data='ann_dataset.csv', headless=False):
    dataset = pd.read_csv(data)
    oversample = True
    try:
        dataset.to_csv("metrics/dataset.csv", index=False)
    except OSError:
        print("error: unable to write to file")
    inputs = len(dataset.columns) - 1
    X = dataset.iloc[:, 0:-2].values
    y = dataset.iloc[:, -1].values

    # Splitting the dataset into the Training set and Test set
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.2,
                                                        random_state=0)

    # Random resampling to reduce the effect of minority class size
    from imblearn.over_sampling import RandomOverSampler
    if oversample:
        # Oversampling of the training set
        ros = RandomOverSampler(random_state=0)
        X_train, y_train = ros.fit_resample(X_train, y_train)
        # Oversampling of the test set as well; note this balances the test
        # distribution, so the reported metrics describe balanced-class
        # performance rather than performance on the original class mix
        X_test, y_test = ros.fit_resample(X_test, y_test)

    # Feature Scaling
    from sklearn.preprocessing import StandardScaler
    sc_X = StandardScaler()
    X_train = sc_X.fit_transform(X_train)
    X_test = sc_X.transform(X_test)

    # ------- Part-2: Build the ANN --------
    # import keras library and packages
    from keras.models import Sequential
    from keras.layers import Dense
    import livelossplot

    # Creating the classifier and setting the layers
    classifier = Sequential()
    classifier.add(Dense(inputs, activation='relu'))
    classifier.add(Dense(inputs, activation='relu'))
    classifier.add(Dense(inputs, activation='sigmoid'))
    classifier.add(Dense(1, activation='sigmoid'))
    classifier.compile(optimizer='adam',
                       loss='binary_crossentropy',
                       metrics=['accuracy'])

    # Fitting the ANN to the training set
    num_epochs = 10
    batch = 100
    if not headless:
        classifier.fit(X_train, y_train,
                       batch_size=batch, epochs=num_epochs,
                       callbacks=[livelossplot.PlotLossesKeras()],
                       verbose=1, validation_data=(X_test, y_test))
    else:
        classifier.fit(X_train, y_train,
                       batch_size=batch, epochs=num_epochs,
                       verbose=1, validation_data=(X_test, y_test))

    # Predicting the Test set results
    y_pred = classifier.predict(X_test)
    score = classifier.evaluate(X_test, y_test, verbose=1)
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])
    print("Classifier Summary")
    classifier.summary()

    # Making the confusion matrix
    from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
    cm = confusion_matrix(y_test, y_pred.round())
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    if not headless:
        disp.plot()
    print("Confusion Matrix:")
    print(cm)

    tn = float(cm[0, 0])
    fp = float(cm[0, 1])
    tp = float(cm[1, 1])
    fn = float(cm[1, 0])
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    f1 = 2 * (precision * recall) / (precision + recall)
    print("Precision: " + str(precision * 100) + "%")
    print("Recall: " + str(recall * 100) + "%")
    print("Sensitivity: " + str(sensitivity * 100) + "%")
    print("Specificity: " + str(specificity * 100) + "%")
    print("F1 Score: " + str(f1))
hist_iphone_3v = px.histogram(iphone_cor_3v, x="iphonesentiment")
plot(hist_iphone_3v)

galaxy_cor_3v = galaxy_corr
galaxy_cor_3v['galaxysentiment'] = galaxy_cor_3v['galaxysentiment'].map(mapper)
galaxy_cor_3v['galaxysentiment'] = pd.Series(galaxy_cor_3v['galaxysentiment'],
                                             dtype="category")
galaxy_cor_3v.dtypes
galaxy_cor_3v['galaxysentiment'].unique()
hist_galaxy_3v = px.histogram(galaxy_cor_3v, x="galaxysentiment")
plot(hist_galaxy_3v)

### Over-sampling

# Random over-sampler; fit_resample replaces the deprecated fit()/sample() pair
ros = RandomOverSampler(random_state=0)
iphone_resampled, isent_resampled = ros.fit_resample(
    iphone_corr.iloc[:, 0:46], iphone_corr['iphonesentiment'])
iphone_resampled_complete = pd.DataFrame(iphone_resampled)
iphone_resampled_complete['iphonesentiment'] = isent_resampled
hist_iphone_resampled = px.histogram(iphone_resampled_complete,
                                     x='iphonesentiment')
plot(hist_iphone_resampled)

galaxy_resampled, gsent_resampled = ros.fit_resample(
    galaxy_corr.iloc[:, 0:45], galaxy_corr['galaxysentiment'])
galaxy_resampled_complete = pd.DataFrame(galaxy_resampled)
galaxy_resampled_complete['galaxysentiment'] = gsent_resampled
hist_galaxy_resampled = px.histogram(galaxy_resampled_complete,
                                     x='galaxysentiment')
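# In recent imbalanced-learn releases (0.6+), fit_resample returns a pandas
# DataFrame/Series when given pandas inputs, so the explicit pd.DataFrame
# wrapping above is only needed on older versions. A hedged sketch:
X_res, y_res = RandomOverSampler(random_state=0).fit_resample(
    iphone_corr.iloc[:, 0:46], iphone_corr['iphonesentiment'])
X_res['iphonesentiment'] = y_res  # X_res is already a DataFrame here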