예제 #1
0
def main():
    """Run RFM segmentation, build features, and compare classifiers.

    The steps run strictly in this order:

    1. Create a ``Segmentation`` object, check its data, compute the RFM
       metric and index, apply ``define_rfm_segment`` row by row, and save
       the segment charts.
    2. Apply ``Label_Segments`` row by row and store the result in the
       ``high_growth`` column of ``segmentation.transaction``.
    3. Generate features and merge them with the per-user labels.
    4. Run logistic regression on the full feature set, then on a reduced
       feature set, followed by cross-validated logistic regression,
       logistic regression with resampling, a decision tree, a random
       forest, and an SVM.
    5. Keep the rows whose random-forest prediction equals ``True``, sort
       them by ``monetary`` (descending), write them to
       ``high_growth_merchant.csv``, and print them.
    """
    # Column groups that are referenced more than once below.
    id_and_target = ['user', 'high_growth']
    report_columns = ['user', 'monetary']
    dropped_features = [
        'monetary',
        'fall_count', 'spring_count', 'summer_count', 'winter_count',
        'spring_amt', 'summer_amt', 'winter_amt', 'fall_amt',
    ]

    # ---- RFM based segmentation ----
    rfm = Segmentation()
    rfm.check_data()

    print("\n\n Start Segmenting Users Based on RFM Methods ... ")

    rfm.get_rfm_metric()
    rfm.get_rfm_index()

    # NOTE(review): the value returned by this apply() is discarded, so it
    # only matters if define_rfm_segment has side effects -- confirm.
    rfm.transaction.apply(rfm.define_rfm_segment, axis=1)

    rfm.save_rfm_segment_pie_chart()
    rfm.save_rfm_segment_scatter_plot()

    # Label every row with the outcome of the RFM segmentation.
    growth_flags = rfm.transaction.apply(rfm.Label_Segments, axis=1)
    rfm.transaction['high_growth'] = growth_flags
    labels_per_user = rfm.transaction[id_and_target]

    rfm.save_user_group_bar_chart()
    print("\n\n Finished Segmenting Users Based on RFM Methods ... ")

    # ---- Feature generation for the classifiers ----
    print("\n\n Start Feature Generation for Classification ... ")
    dataset = rfm.create_features().merge(labels_per_user)
    print("\n\n Finished Feature Generation  ... ")

    models = Classification()

    # Baseline: logistic regression on the complete feature set.
    all_features = dataset.drop(columns=id_and_target)
    target = dataset['high_growth']

    print("\n\nLogistic Regression before feature selection")
    models.run_logistic_regression(all_features, target)

    # Feature selection: heatmap before and after dropping the listed columns.
    models.correlation_heatmap(dataset)

    reduced = dataset.drop(columns=dropped_features)
    models.correlation_heatmap(reduced)

    # Every remaining model is run on the reduced feature set.
    features = reduced.drop(columns=id_and_target)
    target = reduced['high_growth']

    print("\n\nLogistic Regression after feature selection")
    models.run_logistic_regression(features, target)

    # Logistic regression again, this time with cross validation.
    print("\n\nLogistic Regression using Cross Validation")
    models.run_logistic_cross_val(features, target)

    print("\n\nLogistic Regression with SMOTE Resampling")
    models.run_logistic_regression_with_resampling(features, target)

    print("\n\nRun Decision Tree")
    models.run_decision_tree(features, target)

    print("\n\nRun Random Forest")
    forest_predictions = models.run_random_forest(features, target)

    print("\n\nRun Support Vector Machine")
    models.run_svm(features, target)

    # ---- Report the merchants the random forest marks as high growth ----
    # NOTE(review): assumes run_random_forest returns one prediction per row
    # of the dataset -- confirm against its implementation.
    dataset['predicted'] = forest_predictions

    # The explicit comparison is kept on purpose: the dtype of the
    # predictions is not visible from here.
    is_high_growth = dataset['predicted'] == True
    ranked_merchants = dataset.loc[is_high_growth, report_columns].sort_values(
        'monetary', ascending=False)

    ranked_merchants.to_csv('high_growth_merchant.csv', index=False)

    print("HIGH GROWTH MERCHANT as Given by RANDOM FOREST: \n", ranked_merchants)