def main_function(data_frame):
    """Pre-process the dataset and train the top-level (region) classifier.

    The steps run in this order: print dataset details and the class counts,
    impute missing values with the "most_frequent" strategy, drop duplicate
    rows when ``check_duplicates`` reports any, one-hot encode the listed
    categorical columns, and pass the result to ``classify_by_region``.

    Args:
        data_frame: Input dataset. It is grouped by ``SECOND_LEVEL_TARGET``,
            so that column must be present.
    """
    get_details(data_frame)
    print("Class count\n", data_frame.groupby(SECOND_LEVEL_TARGET).size())

    # Fill the gaps first so the duplicate check sees complete rows.
    imputed = impute_missing_values(data_frame, "most_frequent")
    print(imputed.head(20))
    print(imputed.isnull().sum().sum())

    # Correlation matrix (disabled):
    # get_feature_correlations(data_frame, plot=True, return_resulst=True)

    # Remove duplicate records only when the check reports some.
    if check_duplicates(imputed):
        imputed.drop_duplicates(inplace=True)
        print("Dropped duplicate records. Size after dropping duplicates: ",
              imputed.shape)

    # Categorical columns that get one-hot encoded.
    columns_to_encode = [
        'sex',
        'histologic-type',
        'bone',
        'bone-marrow',
        'lung',
        'pleura',
        'peritoneum',
        'liver',
        'brain',
        'skin',
        'neck',
        'supraclavicular',
        'axillar',
        'mediastinum',
        'abdominal',
        'small-intestine',
    ]
    encoded = perform_one_hot_encoding(imputed, columns_to_encode)

    # Top-level classifier: classify by region on the pre-processed data.
    classify_by_region(encoded)

    # Second-level steps (currently disabled):
    # create_separate_datasets(encoded)
    # upper_region_classifier()
    # thoracic_region_classifier()
    # ip_region_classifier()
    # ep_region_classifier()
# NOTE(review): orphaned fragment with no enclosing `def` visible here. It reads `data_frame2`, which is not
# defined anywhere in view, and repeats the EDA / imputation / de-duplication steps of `main_function`.
# Presumably dead or disabled code whose original delimiters were lost -- TODO confirm and delete or restore.
print(data_frame2.shape) # Check for equal distributions figsize(8, 8) # Density plot of the final predictions and the test values sns.kdeplot(data_frame['brain'], label='original') sns.kdeplot(data_frame2['brain'], label='Synthetic') # Label the plot plt.xlabel('Primary Tumor Sites') plt.ylabel('Density') plt.title('Test Values and Predictions') plt.show() ######################################################### EDA ######################################################## print("\n\n!!!!!!!!!!!!!!!!!!!!!!! EDA !!!!!!!!!!!!!!!!!!!!!!!!\n") get_details(data_frame) # visualize_class_distribution(data_frame, "class") # visualise_feature_distribution(data_frame) is_duplicated = check_duplicates(data_frame) ################################################### Data Preprocessing ############################################### print( "\n\n!!!!!!!!!!!!!!!!!!!!!!! DATA PREPROCESSING !!!!!!!!!!!!!!!!!!!!!!!!\n" ) # Impute missing values data_frame = impute_missing_values(data_frame, "most_frequent") # Drop duplicate records if exist if is_duplicated: data_frame.drop_duplicates(inplace=True)
def classify_by_region(data_frame):
    """Train and evaluate the top-level classifier that predicts ``region``.

    The data is split 70/30, the training split is oversampled with
    BorderlineSMOTE, and a ``GradientBoostingClassifier`` is tuned with a
    5-fold grid search scored by micro-averaged F1. The best estimator is
    then evaluated on the oversampled training data and on the untouched
    test split.

    Args:
        data_frame: Pre-processed frame holding the feature columns plus the
            ``region`` (label) and ``class`` columns.
    """
    get_details(data_frame)
    print("Before Oversampling By Region\n", data_frame.groupby('region').size())

    # Both target columns are removed from the features; 'region' is the label.
    X = data_frame.drop(['region', 'class'], axis=1)
    y = data_frame['region']

    mutual_info = mutual_info_classif(X, y, discrete_features='auto')
    print("mutual_info: ", mutual_info)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, shuffle=True)

    # Oversample the training split only, so the test split keeps its
    # original class distribution. `fit_resample` replaces the removed
    # `fit_sample` alias.
    sm = BorderlineSMOTE()
    X_resampled, y_resampled = sm.fit_resample(X_train, y_train)
    print("After Oversampling By Region\n",
          (pd.DataFrame(y_resampled)).groupby('region').size())

    # Perform grid search on the classifier using f1 score as the scoring method
    grid_obj = GridSearchCV(
        estimator=GradientBoostingClassifier(),
        param_grid={
            'n_estimators': [10, 20, 30],
            'max_depth': [6, 10, 20, 30],
            # Integer values of min_samples_split must be >= 2; the
            # previous candidate 1 made those grid points fail.
            'min_samples_split': [2, 10, 100],
        },
        n_jobs=-1,
        scoring="f1_micro",
        cv=5,
        verbose=3,
    )

    # Fit the grid search object to the training data and find the optimal parameters
    grid_fit = grid_obj.fit(X_resampled, y_resampled)

    # Get the best estimator
    best_clf = grid_fit.best_estimator_
    print(best_clf)

    # Earlier results noted by the author:
    # LR(multiclass-ovr) -0.66, 0.67, 0.67, 0.69, 0.69, 0.68 MLP wid fs - 0.65, 0.69, 0.70,
    # GB - 0.67, without fs 0.62, 0.61, DT - 0.58, RF - 0.67, multi_LR - wid fs 0.64 , voting - 0.60
    parent_model = best_clf

    # GridSearchCV already refits the best estimator by default; it is
    # fitted again here only so the training time can be measured.
    t0 = time()
    parent_model.fit(X_resampled, y_resampled)
    print("training time:", round(time() - t0, 3), "s")

    # Evaluate the final model on the training set
    train_predictions = parent_model.predict(X_resampled)
    print_evaluation_results(y_resampled, train_predictions)

    # Evaluate the final model on the test set
    t0 = time()
    test_predictions = parent_model.predict(X_test)
    print("predicting time:", round(time() - t0, 3), "s")
    print_evaluation_results(y_test, test_predictions, train=False)

    # NOTE(review): called with (model, X_train, y_train, X_test, y_test),
    # so this is presumably a project helper rather than
    # sklearn.metrics.confusion_matrix -- confirm.
    confusion_matrix(parent_model, X_resampled, y_resampled, X_test, y_test)
def thoracic_region_classifier():
    """Train, evaluate and save the second-level classifier for the thoracic region.

    Reads ``../resources/datasets/thoracic_region.csv`` (path relative to
    the current working directory), drops the ``region`` column, and splits
    70/30. The training split is balanced with random oversampling, then a
    ``DecisionTreeClassifier`` is fitted and evaluated on both splits. The
    fitted model is written to ``../resources/models/sub_classifier_2.pkl``.
    """
    data_frame = pd.read_csv("../resources/datasets/thoracic_region.csv",
                             na_values='?', dtype='category')
    data_frame.drop('region', axis=1, inplace=True)
    get_details(data_frame)

    print("Before Oversampling By Class\n", data_frame.groupby('class').size())

    features = data_frame.drop(['class'], axis=1)
    labels = data_frame['class']  # Labels - 1, 22

    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.3, random_state=42, shuffle=True)

    print("X_train2 ", pd.DataFrame(X_train).shape)
    print("X_train2 ", pd.DataFrame(y_train).shape)

    # Random oversampling of the training split only. `fit_resample`
    # replaces the removed `fit_sample` alias.
    oversampler = RandomOverSampler()
    X_resampled2, y_resampled2 = oversampler.fit_resample(X_train, y_train)

    print("X_train2 ", pd.DataFrame(X_resampled2).shape)
    print("X_train2 ", pd.DataFrame(y_resampled2).shape)

    df = pd.DataFrame(y_resampled2)
    print(df.groupby('class').size())

    # Earlier results noted by the author:
    # MLP- 0.88, ExtraTreeClassifier-0.73,0.97,0.94,0.91,0.94, 0.94, 0.86 RF- 0.88, 0.88
    # GB- 0.89, 0.89 LR()- 0.88, 0.88 LogisticRegression(solver='liblinear', multi_class='ovr') - 0.92,
    # kNN- 0.87, 0.92, 0.84 DT- 0.94, 0.94, 0.89, 0.94 SVC(gamma='auto') - 0.94, MultinomialNB() - 0.88
    thoracic_model = DecisionTreeClassifier()

    # Train the final model
    thoracic_model = thoracic_model.fit(X_resampled2, y_resampled2)

    # Evaluate the final model on the training set
    predictions = thoracic_model.predict(X_resampled2)
    print_evaluation_results(y_resampled2, predictions)

    # Evaluate the final model on the test set
    predictions = thoracic_model.predict(X_test)
    print_evaluation_results(y_test, predictions, train=False)

    joblib.dump(thoracic_model, filename='../resources/models/sub_classifier_2.pkl')
def extra_peritoneum_region_classifier():
    """Train, evaluate and save the second-level classifier for the extra-peritoneum region.

    Reads ``../resources/datasets/extra_peritoneum_region.csv`` (path
    relative to the current working directory), drops the ``region`` column,
    and splits 70/30. The training split is balanced with random
    oversampling and reduced to the 8 best features by chi-squared score.
    A one-vs-rest random forest is then fitted and evaluated on both splits.
    The fitted model is written to ``../resources/models/sub_classifier_4.pkl``.
    """
    data_frame = pd.read_csv(
        "../resources/datasets/extra_peritoneum_region.csv",
        na_values='?',
        dtype='category')
    data_frame.drop('region', axis=1, inplace=True)
    get_details(data_frame)

    print("Before Oversampling By Class\n", data_frame.groupby('class').size())

    features = data_frame.drop(['class'], axis=1)
    labels = data_frame['class']  # Labels - 8, 14, 15, 16, 17, 18, 19, 20, 21

    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.3, random_state=42, shuffle=True)

    print("X_train2 ", pd.DataFrame(X_train).shape)
    print("X_train2 ", pd.DataFrame(y_train).shape)

    # Random oversampling of the training split only.
    oversampler = RandomOverSampler()
    X_resampled, y_resampled = oversampler.fit_resample(X_train, y_train)

    df = pd.DataFrame(y_resampled)
    print(df.groupby('class').size())

    # The selector is fitted on the training data only and then applied
    # unchanged to the test split.
    sel_chi2 = SelectKBest(chi2, k=8)  # select 8 features
    X_train_chi2 = sel_chi2.fit_transform(X_resampled, y_resampled)
    print(sel_chi2.get_support())
    X_test_chi2 = sel_chi2.transform(X_test)
    print(X_test.shape)
    print(X_test_chi2.shape)

    # Spot Check Algorithms
    spot_check_algorithms(X_train_chi2, y_resampled)

    # Earlier results noted by the author:
    # OneVsRest(GaussianNB) 0.43, DT 0.78, 0.86, 0.72, RF 0.80, 0.74, 0.69, 0.65, 0.66, voting 0.51
    # this model: -0.48, wid 8 features - 0.48, 0.52, 0.53, 0.54, 0.57, wid * f - 0.51, clf - 0.48, kNN - 0.49
    extra_peritoneum_model = OneVsRestClassifier(RandomForestClassifier())

    # Train the final model
    extra_peritoneum_model = extra_peritoneum_model.fit(
        X_train_chi2, y_resampled)

    # Evaluate the final model on the training set
    predictions = extra_peritoneum_model.predict(X_train_chi2)
    print_evaluation_results(y_resampled, predictions)

    # Evaluate the final model on the test set
    predictions = extra_peritoneum_model.predict(X_test_chi2)
    print_evaluation_results(y_test, predictions, train=False)

    joblib.dump(extra_peritoneum_model,
                filename='../resources/models/sub_classifier_4.pkl')
def upper_region_classifier():
    """Train and evaluate the second-level classifier for the upper region.

    Reads ``../resources/datasets/upper_region.csv`` (path relative to the
    current working directory), drops the ``region`` column, and splits
    70/30. The training split is balanced with random oversampling, then a
    ``RandomForestClassifier`` is fitted and evaluated on both splits.
    This function does not save the fitted model.
    """
    # Read in data
    data_frame = pd.read_csv("../resources/datasets/upper_region.csv",
                             na_values='?', dtype='category')
    data_frame.drop('region', axis=1, inplace=True)
    get_details(data_frame)

    print("Before Oversampling By Class\n", data_frame.groupby('class').size())

    features = data_frame.drop(['class'], axis=1)
    labels = data_frame['class']  # Labels - 2, 4, 10

    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.3, random_state=42, shuffle=True)

    print("X_train2 ", pd.DataFrame(X_train).shape)
    print("X_train2 ", pd.DataFrame(y_train).shape)

    # Random oversampling of the training split only. `fit_resample`
    # replaces the removed `fit_sample` alias.
    ros = RandomOverSampler()
    X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

    print("X_train2 ", pd.DataFrame(X_resampled).shape)
    print("X_train2 ", pd.DataFrame(y_resampled).shape)

    df = pd.DataFrame(y_resampled)
    print(df.groupby('class').size())

    # Earlier results noted by the author:
    # DT()- 0.92*4, RF()-0.91, RF(n_estimators=200) - 0.91, 1.0,
    # kNN - 0.91, knn(neigh-3) - 0.78, knn(neigh-5) - 0.91, DT - 0.79, 0.91, 0.92,
    # SVC(gamma='auto') - 0.91, LogisticRegression(solver='liblinear', multi_class='ovr') - 0.85
    # chi2 selection: DT chi 9- 0.83*2, 10- 91,92; RF chi 9- 1,1,91,1,91; GB chi 9- 91,91,82,91
    upper_region_model = RandomForestClassifier(n_jobs=-1, max_depth=20,
                                                n_estimators=200)

    # Train the final model
    upper_region_model = upper_region_model.fit(X_resampled, y_resampled)

    # Evaluate the final model on the training set
    predictions = upper_region_model.predict(X_resampled)
    print_evaluation_results(y_resampled, predictions)

    # Evaluate the final model on the test set
    predictions = upper_region_model.predict(X_test)
    print_evaluation_results(y_test, predictions, train=False)
def intra_peritoneum_region_classifier():
    """Train, evaluate and save the second-level classifier for the intra-peritoneum region.

    Reads ``../resources/datasets/intra_peritoneum_region.csv`` (path
    relative to the current working directory), drops the ``region`` column,
    and splits 70/30. The minority class of the training split is randomly
    oversampled, and the 8 best features by chi-squared score are kept.
    A 5-nearest-neighbours classifier is then fitted and evaluated on both
    splits. The fitted model is written to
    ``../resources/models/sub_classifier_3.pkl``.
    """
    # Local import keeps this block self-contained.
    from sklearn.neighbors import KNeighborsClassifier

    data_frame = pd.read_csv(
        "../resources/datasets/intra_peritoneum_region.csv",
        na_values='?',
        dtype='category')
    data_frame.drop('region', axis=1, inplace=True)
    get_details(data_frame)

    print("Before Oversampling By Class\n", data_frame.groupby('class').size())

    features = data_frame.drop(['class'], axis=1)
    labels = data_frame['class']  # Labels - 3, 5, 6, 7, 11, 12, 13

    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.3, random_state=42, shuffle=True)

    print("X_train2 ", pd.DataFrame(X_train).shape)
    print("X_train2 ", pd.DataFrame(y_train).shape)

    # Only the minority class is oversampled. Author's note on this setting:
    # minority - hamming loss increases, accuracy, jaccard, avg f1, macro avg decreases
    # `fit_resample` replaces the removed `fit_sample` alias.
    oversampler = RandomOverSampler(sampling_strategy='minority')
    X_resampled, y_resampled = oversampler.fit_resample(X_train, y_train)

    print("X_train2 ", pd.DataFrame(X_resampled).shape)
    print("X_train2 ", pd.DataFrame(y_resampled).shape)

    df = pd.DataFrame(y_resampled)
    print(df.groupby('class').size())

    # The selector is fitted on the training data only and then applied
    # unchanged to the test split.
    sel_chi2 = SelectKBest(chi2, k=8)  # select 8 features
    X_train_chi2 = sel_chi2.fit_transform(X_resampled, y_resampled)
    print(sel_chi2.get_support())
    X_test_chi2 = sel_chi2.transform(X_test)
    print(X_test.shape)
    print(X_test_chi2.shape)

    # Earlier results noted by the author:
    # kNN()- 0.39, kNN(neig-5) - 0.44, 0.39, LogisticRegression(solver='liblinear', multi_class='ovr'),
    # wid 8, 9 features - 0.42, wid 12 featues - 0.40, SVC(gamma='auto') - 0.35,
    # OneVsRestClassifier(GaussianNB()) - 0.05
    #
    # This used to be IsolationForest, an unsupervised outlier detector: it
    # ignores the labels and predicts only +1/-1, so it could never be
    # scored against the class labels. It is replaced by the best-scoring
    # classifier from the notes above.
    intra_peritoneum_model = KNeighborsClassifier(n_neighbors=5)

    # Train the final model
    intra_peritoneum_model = intra_peritoneum_model.fit(
        X_train_chi2, y_resampled)

    # Evaluate the final model on the training set
    predictions = intra_peritoneum_model.predict(X_train_chi2)
    print_evaluation_results(y_resampled, predictions)

    # Evaluate the final model on the test set
    predictions = intra_peritoneum_model.predict(X_test_chi2)
    print_evaluation_results(y_test, predictions, train=False)

    joblib.dump(intra_peritoneum_model,
                filename='../resources/models/sub_classifier_3.pkl')