def get_data(fillna=True):
    """Load the APS-failure train/test CSVs and return model-ready splits.

    Parameters
    ----------
    fillna : bool, default True
        When True, impute every missing value with the sentinel -1 before
        splitting features from the class column; when False, impute each
        feature with its own median (computed separately on the train and
        test frames) after the split.

    Returns
    -------
    tuple
        ``(X, X_test, y, y_test)`` — float64 feature frames and integer
        labels ('pos' -> 1, 'neg' -> 0).
    """
    # na_values='na' parses the dataset's missing-value sentinel at read
    # time; this is equivalent to the previous read-then-
    # .replace('na', np.nan) pass, and matches the loader style used
    # elsewhere in this file.
    aps_train = pd.read_csv('./aps_failure_training_set.csv', skiprows=20,
                            keep_default_na=False, na_values='na')
    aps_test = pd.read_csv('./aps_failure_test_set.csv', skiprows=20,
                           keep_default_na=False, na_values='na')

    # Columns dropped by the exploratory analysis elsewhere in this file:
    # 'cd_000' has zero variance, the others have too many missing values.
    columns_to_remove = [
        'br_000', 'bq_000', 'bp_000', 'bo_000', 'ab_000',
        'cr_000', 'bn_000', 'bm_000', 'cd_000',
    ]
    aps_train = aps_train.drop(columns=columns_to_remove)
    aps_test = aps_test.drop(columns=columns_to_remove)

    # Sentinel imputation happens on the raw frames, before the split.
    if fillna:
        aps_train = aps_train.fillna(-1)
        aps_test = aps_test.fillna(-1)

    X, y = split_dataset(aps_train, CLASS)
    X_test, y_test = split_dataset(aps_test, CLASS)
    X = X.astype('float64')
    X_test = X_test.astype('float64')

    # Median imputation needs numeric dtypes, hence it runs after astype.
    if not fillna:
        X = X.apply(lambda col: col.fillna(col.median()))
        X_test = X_test.apply(lambda col: col.fillna(col.median()))

    y = y.map({'pos': 1, 'neg': 0})
    y_test = y_test.map({'pos': 1, 'neg': 0})
    return X, X_test, y, y_test
# NOTE(review): this chunk begins mid-statement — the line below is the tail
# of a pd.read_csv(...) call for orig_train whose opening is outside this view.
keep_default_na=False, na_values='na')
orig_test = pd.read_csv('./aps_failure_test_set.csv', skiprows=20,
                        keep_default_na=False, na_values='na')
""" Let's first analyze the baseline we are working with, we are going to evaluate Naive Bayes, Knn and Random Forest with no preprocessing and checck if the results between them are similar or not. """
X_train, y_train = split_dataset(orig_train, CLASS)
X_test, y_test = split_dataset(orig_test, CLASS)
# Baseline imputation: every missing value becomes 0 (no per-column strategy).
X_train, X_test = X_train.fillna(0), X_test.fillna(0)
# Encode the binary class label as integers: 'pos' -> 1, 'neg' -> 0.
y_train = y_train.map({'pos': 1, 'neg': 0})
y_test = y_test.map({'pos': 1, 'neg': 0})
# Module-level flag — presumably toggled once the data is rebalanced; the
# code that flips it is not visible in this view.
balanced = False
print('Balanced data: {}'.format(collections.Counter(y_train)))


def get_data(X_train=X_train, X_test=X_test):
    # NOTE(review): the defaults bind the module-level frames as they exist
    # at *definition* time, not at call time.
    global y_train
    global balanced
    # get_high_correlated_cols is a project helper defined outside this view;
    # presumably returns column names to drop for high correlation — confirm.
    high_corr = get_high_correlated_cols()
    X_train = X_train.drop(columns=high_corr)
    X_test = X_test.drop(columns=high_corr)
    # NOTE(review): definition truncated here — the rest of the body (and its
    # return) is outside this view.
# NOTE(review): fragment — green_data and hinselmann_data are loaded before
# this view begins.
schiller_data = pd.read_csv('../schiller.csv')
data = [[green_data, 'green_data'], [hinselmann_data, 'hinselmann_data'],
        [schiller_data, 'schiller_data']]
# One-hot flags identifying which modality each row came from
# (green is the implicit 0/0 baseline).
green_data['hinselmann'] = 0
green_data['schiller'] = 0
hinselmann_data['hinselmann'] = 1
hinselmann_data['schiller'] = 0
schiller_data['hinselmann'] = 0
schiller_data['schiller'] = 1
# Stack the three modality tables into one frame.
# NOTE(review): DataFrame.append is deprecated in pandas >= 1.4 (removed in
# 2.0); pd.concat is the replacement — confirm the pinned pandas version.
super_table = green_data.append(hinselmann_data)
super_table = super_table.append(schiller_data)
X, y = split_dataset(super_table, CLASS)
# getKBest is a project helper (outside this view); presumably performs
# univariate feature selection — confirm.
X = getKBest(X, y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=0)
# Oversample the minority class on the training split only.
sm = SMOTE(random_state=2)
# NOTE(review): fit_sample was renamed fit_resample in imbalanced-learn
# (fit_sample removed in 0.6) — confirm the pinned imblearn version.
X_train_res, y_train_res = sm.fit_sample(X_train, y_train.ravel())
results = {}
for clf in base_clfs:
    clf_name = type(clf).__name__
    # NOTE(review): loop body truncated here — the rest is outside this view.
def explore():
    """Exploratory analysis of the APS failure dataset: loading, missing
    values, and class balance (definition continues past this view)."""
    sns.set()
    sns.set_style("darkgrid")
    # NOTE(review): despine() is called before any axes exist — likely a no-op
    # here; confirm intent.
    sns.despine()
    """ Problem description: Minimize the costs of repairing APS. Try to predict when a failure is going to occur. Predicting a failure when it's not cost: 10 Missing a failure cost: 500 (this is available in the txt that comes with the dataset) """
    aps_train = pd.read_csv('./aps_failure_training_set.csv', skiprows=20,
                            keep_default_na=False)
    aps_test = pd.read_csv('./aps_failure_test_set.csv', skiprows=20,
                           keep_default_na=False)
    """ Problem: Missing values are represented as 'na' need to replace those with nulls Explore: Different strategies of replacing the nans values and comparing the results """
    aps_train.replace('na', np.nan, inplace=True)
    aps_test.replace('na', np.nan, inplace=True)
    """ Problem: Data balancing, the classes distribution is not balanced this is normal since this is a failure thing and it's normal that it doesn't fail as much as it works """
    print(aps_train[CLASS].value_counts())
    # BUG(review): duplicated line — the second print presumably was meant to
    # show aps_test's class distribution.
    print(aps_train[CLASS].value_counts())
    """ train_hist = sns.countplot(x=CLASS, data=aps_train) train_hist.set(xlabel='Failure', ylabel='Count') #plt.savefig('images/unbalanced_train.pdf') plt.clf() test_hist = sns.countplot(x=CLASS, data=aps_test) test_hist.set(xlabel='Failure', ylabel='Count') #plt.savefig('images/unbalanced_test.pdf') plt.clf() """
    X_train, y_train = split_dataset(aps_train, CLASS)
    # BUG(review): splits aps_train again — the test split should almost
    # certainly come from aps_test.
    X_test, y_test = split_dataset(aps_train, CLASS)
    y_train = y_train.map({'pos': 1, 'neg': 0})
    y_test = y_test.map({'pos': 1, 'neg': 0})
    """ Good thing: no class attributes, no need for dummy transformations only on the class itself """
    X_train = X_train.astype('float64')
    X_test = X_test.astype('float64')
    """ Data exploration: There are 170 columns, all of them are numeric values. talk about training and test dataset instances There is no information about the attributes real meaning because they are anonymized for proprietary issues. We can't inffer attributes because of this. 
There are attributes that correspond to intervals, "bins", discretize (?) this attributes. """
    """ Problem: Missing data. """
    # How many rows would survive a naive drop-all-missing approach.
    print('Number of rows after removing training missing values: {}'.format(
        aps_train.dropna().shape[0]))
    print('Number of rows after removing test missing values: {}'.format(
        aps_test.dropna().shape[0]))
    """ the number of instances available after removing the missing values is very low comparing to the original dataset. Try to explore the dataset to remove columns that have to many missing values or values that don't vary accross instances I think it makes more sense to work with both datasets concatenated, so we can see the overall thing. Check if instances are 'pos' before removing them. """
    # Analyze train+test together to get per-column global statistics.
    aps = pd.concat([aps_train, aps_test])
    X, y = split_dataset(aps, CLASS)
    X = X.astype('float64')
    num_instances = X.shape[0]
    cols_missing = {}   # column name -> percentage of missing values
    no_std_dev = []     # columns with zero standard deviation (constant)
    for col in X:
        #print('Analyzing col {}'.format(col))
        #print()
        # standard_deviation is a project helper defined outside this view;
        # presumably the column's std — confirm.
        std_dev = standard_deviation(X[col])
        #print('Standard deviation of attribute: {}'.format(std_dev))
        if std_dev == 0:
            no_std_dev.append(col)
        num_missing = X[col].isna().sum()
        percentage_missing = (num_missing / num_instances) * 100
        cols_missing[col] = percentage_missing
        #print('Percentage of missing: {}'.format(percentage_missing))
        #print()
        #print()
    ATTRIBUTE = 'Attribute'
    PERCENTAGE = 'Percentage'
    # Tabulate missing percentages, worst columns first.
    missing_df = pd.DataFrame(list(cols_missing.items()))
    missing_df.columns = [ATTRIBUTE, PERCENTAGE]
    missing_df = missing_df.sort_values([PERCENTAGE],
                                        ascending=False).reset_index(drop=True)
    # filter by high percentage
    missing_df = missing_df.loc[missing_df[PERCENTAGE] > 30]
    """ plt.figure(figsize=(missing_df.shape[1] + 5, 10)) ax = sns.barplot(missing_df.index, missing_df[PERCENTAGE], color="steelblue") ax.set(xlabel=ATTRIBUTE, ylabel=PERCENTAGE) ax.set_xticklabels(missing_df[ATTRIBUTE]) for item in ax.get_xticklabels(): item.set_rotation(45) plt.savefig('images/missing_values.pdf') plt.clf() """
    """ There is a column, 'cd_000', which has no 
variance in it's values, this is, the values are all the same so it isn't representative of the data or something like that. Drop this column """
    # Columns with more than 60% of their values missing — candidates to drop.
    high_percentage_missing = missing_df.loc[
        missing_df[PERCENTAGE] > 60][ATTRIBUTE].values
    print(high_percentage_missing)
    """ There are 7 columns with a missing percentage of over 70% Consider to drop this columns because they have too many different missing values. With this parameters there are 26535 rows left, it's better. """
    """ Before removing the attributes one should analyse if the attribute are important to determine the class. "discriminative (supervised learning) or informative (unsupervised learning)" """
    """ Analyze distributions """
    """ sns.pairplot(aps, hue=CLASS) plt.savefig('images/pair_plot.pdf') plt.clf() plt.figure(figsize=(140,140)) sns.heatmap(data=X.corr()) plt.savefig('images/correlations.pdf') plt.clf() """
    # NOTE(review): the triple quote below opens a string that does not close
    # within this view — the chunk is truncated here.
    """
# NOTE(review): fragment — the line below is the tail of a pd.read_csv(...)
# call for aps_train whose opening is outside this view.
keep_default_na=False)
aps_test = pd.read_csv('./aps_failure_test_set.csv', skiprows=20,
                       keep_default_na=False)
aps_train.replace('na', np.nan, inplace=True)
aps_test.replace('na', np.nan, inplace=True)
#drop NaN values
# problem -> we go from 60000 rows to only 591
aps_train = aps_train.dropna()
aps_test = aps_test.dropna()
X_train, y_train = split_dataset(aps_train, 'class')
# BUG(review): splits aps_train again — the test split should almost
# certainly come from aps_test. Also note the 'class' literal here where
# other chunks use the CLASS constant.
X_test, y_test = split_dataset(aps_train, 'class')
y_train = y_train.map({'pos': 1, 'neg': 0})
y_test = y_test.map({'pos': 1, 'neg': 0})
X_train = X_train.astype('float64')
X_test = X_test.astype('float64')
# Baseline KNN (k=10) evaluated via the project's classifier_statistics
# helper (defined outside this view).
neigh = KNeighborsClassifier(n_neighbors=10)
res = classifier_statistics(neigh, X_train, X_test, y_train, y_test)
pprint(res)
""" k=2