Example #1
def main_function(data_frame):
    get_details(data_frame)
    print("Class count\n", data_frame.groupby(SECOND_LEVEL_TARGET).size())

    # Impute missing values
    data_frame = impute_missing_values(data_frame, "most_frequent")
    print(data_frame.head(20))
    print(data_frame.isnull().sum().sum())

    # Get the correlation matrix
    # get_feature_correlations(data_frame, plot=True, return_results=True)

    # Check if duplicate records exist
    is_duplicated = check_duplicates(data_frame)
    # Drop duplicate records if exist
    if is_duplicated:
        data_frame.drop_duplicates(inplace=True)
        print("Dropped duplicate records. Size after dropping duplicates: ",
              data_frame.shape)

    # One Hot Encoding
    columns_to_encode = [
        'sex', 'histologic-type', 'bone', 'bone-marrow', 'lung', 'pleura',
        'peritoneum', 'liver', 'brain', 'skin', 'neck', 'supraclavicular',
        'axillar', 'mediastinum', 'abdominal', 'small-intestine'
    ]
    data_frame = perform_one_hot_encoding(data_frame, columns_to_encode)

    # Pre-processed dataset
    pre_processed_data = data_frame

    # Top Level Classifier - classify by region
    classify_by_region(pre_processed_data)

    # Create balanced datasets for the second level
    # create_separate_datasets(pre_processed_data)

    # upper_region_classifier()
    # thoracic_region_classifier()
    ip_region_classifier()
    ep_region_classifier()
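
The helper functions called in this example (get_details, impute_missing_values, check_duplicates, perform_one_hot_encoding) are defined elsewhere in the project and are not shown here. Below is a minimal sketch of what they could look like, assuming pandas and scikit-learn's SimpleImputer; the bodies are assumptions based on how the helpers are called above, not the original implementations.

import pandas as pd
from sklearn.impute import SimpleImputer


def get_details(data_frame):
    # Basic overview: shape, dtypes, and missing-value counts.
    print("Shape:", data_frame.shape)
    print(data_frame.dtypes)
    print("Missing values per column:\n", data_frame.isnull().sum())


def impute_missing_values(data_frame, strategy):
    # Fill missing values column-wise, e.g. with the most frequent value.
    imputer = SimpleImputer(strategy=strategy)
    imputed = imputer.fit_transform(data_frame)
    return pd.DataFrame(imputed, columns=data_frame.columns)


def check_duplicates(data_frame):
    # Report whether any fully duplicated rows exist.
    duplicate_count = data_frame.duplicated().sum()
    print("Duplicate records:", duplicate_count)
    return duplicate_count > 0


def perform_one_hot_encoding(data_frame, columns_to_encode):
    # Expand the listed categorical columns into 0/1 indicator columns.
    return pd.get_dummies(data_frame, columns=columns_to_encode)

The same helpers are reused in Example #2 below.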
Example #2
plt.show()

#########################################################  EDA  ########################################################
print("\n\n!!!!!!!!!!!!!!!!!!!!!!!  EDA  !!!!!!!!!!!!!!!!!!!!!!!!\n")
get_details(data_frame)
# visualize_class_distribution(data_frame, "class")
# visualise_feature_distribution(data_frame)
is_duplicated = check_duplicates(data_frame)

###################################################  Data Preprocessing  ###############################################
print(
    "\n\n!!!!!!!!!!!!!!!!!!!!!!!  DATA PREPROCESSING  !!!!!!!!!!!!!!!!!!!!!!!!\n"
)

# Impute missing values
data_frame = impute_missing_values(data_frame, "most_frequent")

# Drop duplicate records if exist
if is_duplicated:
    data_frame.drop_duplicates(inplace=True)
    print("Dropped duplicate records. Size after dropping duplicates: ",
          data_frame.shape)

print("Combining original dataset with synthetic samples")
data_frame = pd.concat([data_frame, data_frame2])
get_details(data_frame)

# One Hot Encoding
columns_to_encode = [
    'sex', 'histologic-type', 'bone', 'bone-marrow', 'lung', 'pleura',
    'peritoneum', 'liver', 'brain', 'skin', 'neck', 'supraclavicular',