import numpy as np

import common  # project-local data helpers
import smote  # project-local SMOTE wrapper


def main():
    # Load the preprocessed dataset and split it into train/test sets.
    data_train, data_test, target_train, target_test = common.load_train_data_and_split(
        file='data/processed_missing_filled_in.csv')
    data_train = np.asarray(data_train)
    target_train = np.array(target_train).astype(np.int32)
    print(target_train)
    # Rebalance the training classes with SMOTE before fitting.
    data_train, target_train = smote.smote_data(data_train, target_train)
    classify(data_train, target_train, data_test, target_test)
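# smote.smote_data is project-local and not shown in this snippet. A minimal
# sketch of what such a helper could look like, assuming it wraps the
# third-party imbalanced-learn package (the signature and defaults here are
# assumptions, not the repo's actual implementation):
from imblearn.over_sampling import SMOTE


def smote_data(inputs, outputs, k_neighbors=5):
    # Synthesize minority-class samples until every class matches the
    # size of the majority class.
    sampler = SMOTE(k_neighbors=k_neighbors, random_state=42)
    return sampler.fit_resample(inputs, outputs)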
import csv

import numpy as np
from sklearn.model_selection import train_test_split

import smote as sm  # project-local SMOTE wrapper


def load_train_data_and_split(testsize=0.3,
                              targetcol=-1,
                              file='data/processed_without_missing.csv',
                              split=True,
                              num_samples_per_class=-1,
                              num_classes=3,
                              smote=False):
    print("Loading dataset.")
    headers = []
    dataset = []
    with open(file, "r") as ifile:
        reader = csv.reader(ifile)
        first = True
        for row in reader:
            if first:
                first = False
                headers.append(row)
                continue
            dataset.append(row)
            # Flag rows with non-numeric cells before the int() cast below.
            for i in row:
                if not i.isdigit():
                    print(row)
    inputs = [[int(y) for y in x] for x in dataset]
    outputs = []
    for row in inputs:
        outputs.append(row[targetcol])
        del row[targetcol]
    if num_samples_per_class > 0:
        # We want equal subsets of each class.
        # First get the number of classes and their values.
        output_vals = set()
        for i, row in enumerate(inputs):
            output_vals.add(outputs[i])
        print("Number of classes: ", len(output_vals))
        # Then delete samples that go over the num_samples_per_class
        # (e.g. 3000) limit.
        counts = [0, 0, 0]
        remove_indices = []
        for i, row in enumerate(inputs):
            # Map target values (1, 2, 3) to array indices (0, 1, 2).
            counts[outputs[i] - 1] += 1
            if counts[outputs[i] - 1] > num_samples_per_class:
                # We exceeded the count, so delete this row.
                remove_indices.append(i)
        # Delete in reverse so earlier indices stay valid.
        for i in reversed(range(len(remove_indices))):
            del inputs[remove_indices[i]]
            del outputs[remove_indices[i]]
        print("Final counts: ", counts)
        print("Num rows: ", len(inputs))
    print("Done loading")
    # Shift targets from (1, 2, 3) to zero-based (0, 1, 2).
    for i in range(len(outputs)):
        outputs[i] -= 1
    if split:
        input_train, input_test, output_train, output_test = train_test_split(
            inputs, outputs, test_size=testsize, random_state=42)
        input_train = np.array(input_train)
        input_test = np.array(input_test)
        output_train = np.array(output_train).astype(np.int32)
        output_test = np.array(output_test).astype(np.int32)
        if num_classes == 2:
            # Merge class 3 into class 2 (2 -> 1 after the zero-based shift).
            output_train[output_train == 2] = 1
            output_test[output_test == 2] = 1
        if smote:
            input_train, output_train = sm.smote_data(input_train, output_train)
        return input_train, input_test, output_train, output_test
    else:
        inputs = np.array(inputs)
        outputs = np.array(outputs)
        return inputs, outputs
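# Usage sketch for the loader above. The CSV path is one already used in this
# repo; num_samples_per_class=3000 mirrors the 3000 cap the comments mention,
# and num_classes=2 folds class 3 into class 2:
X_tr, X_te, y_tr, y_te = load_train_data_and_split(
    file='data/processed_missing_filled_in.csv',
    num_samples_per_class=3000,
    num_classes=2)
print("Train rows:", len(X_tr), " Test rows:", len(X_te))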
    # SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=15),
    RandomForestClassifier(max_depth=15, n_estimators=10, max_features=1),
    AdaBoostClassifier(),
    GaussianNB(),
    LDA(),
    QDA()
]

X_train, X_test, y_train, y_test = common.load_train_data_and_split(
    file='data/processed_missing_filled_in.csv')
X_train = np.asarray(X_train)
y_train = np.array(y_train).astype(np.int32)
# Rebalance the training classes with SMOTE before fitting.
X_train, y_train = smote.smote_data(X_train, y_train)

# Iterate over classifiers.
for name, clf in zip(names, classifiers):
    print("Fitting " + name + "...")
    predicted_test = clf.fit(X_train, y_train).predict(X_test)
    test_p = (y_test != predicted_test).sum() / len(X_test) * 100
    # %.2f rather than %d: the error rate is fractional, and %d truncates it.
    print("Error on test set: %.2f%%" % test_p)
    print(metrics.classification_report(y_test, predicted_test))

# Results (f1-score) -> rows with missing medical speciality removed:
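# Optional extension (not in the original script): after the loop, name and
# predicted_test still refer to the last classifier fitted, so a confusion
# matrix can show which classes it confuses, detail the overall error
# percentage hides; metrics is sklearn.metrics, as used above.
print("Confusion matrix for %s:" % name)
print(metrics.confusion_matrix(y_test, predicted_test))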
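# The abalone experiment below calls a separate() helper that is not shown in
# this snippet. A hypothetical sketch, assuming it is a seeded train/test
# splitter (reading S as a random seed is an assumption) that returns results
# in the (X_train, Y_train, X_test, Y_test) order the calls below expect:
from sklearn.model_selection import train_test_split


def separate(test_fraction, inputs, outputs, seed):
    # Hold out test_fraction of the data, seeded for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(
        inputs, outputs, test_size=test_fraction, random_state=seed)
    return X_train, y_train, X_test, y_test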
"ID3:", id3(X_train_r_abalone, Y_train_r_abalone, X_test_r_abalone, Y_test_r_abalone)) print( "ANN:", neural_net(X_train_r_abalone, Y_train_r_abalone, X_test_r_abalone, Y_test_r_abalone, 0.0001, sigmoidal, 10000, prt=False)) print("\nAbalone smoted:") x_abalone_new, y_abalone_new = sm.smote_data(x_abalone_r, y_abalone_r, dts, 5) X_train_abalone_new, Y_train_abalone_new, X_test_abalone_new, Y_test_abalone_new = separate( 0.3, x_abalone_new, y_abalone_new, S) print( "NBG:", nb_gaussiano(X_train_abalone_new, Y_train_abalone_new, X_test_abalone_new, Y_test_abalone_new)) print( "KNN:", knn(X_train_abalone_new, Y_train_abalone_new, X_test_abalone_new, Y_test_abalone_new, 15)) print( "ID3:", id3(X_train_abalone_new, Y_train_abalone_new, X_test_abalone_new, Y_test_abalone_new)) print(