Example #1
import numpy as np

import common
import smote


def main():
    data_train, data_test, target_train, target_test = common.load_train_data_and_split(
        file='data/processed_missing_filled_in.csv')

    # Convert to NumPy arrays with integer class labels.
    data_train = np.asarray(data_train)
    target_train = np.array(target_train).astype(np.int32)

    print(target_train)  # sanity check on the raw labels

    # Balance the training classes with SMOTE before fitting.
    data_train, target_train = smote.smote_data(data_train, target_train)

    classify(data_train, target_train, data_test, target_test)  # classify() is defined elsewhere in the project
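
The smote module used above is project-local and not shown here. As a point of reference, a minimal smote_data helper could be built on imbalanced-learn's SMOTE; this is an assumption about its behaviour, not the original implementation:

# Hypothetical sketch of a smote_data helper, built on imbalanced-learn.
# The real project module may differ.
from imblearn.over_sampling import SMOTE

def smote_data(X, y, k_neighbors=5, random_state=42):
    # Oversample minority classes until all classes have equal counts.
    sampler = SMOTE(k_neighbors=k_neighbors, random_state=random_state)
    return sampler.fit_resample(X, y)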
Example #2
import csv

import numpy as np
from sklearn.model_selection import train_test_split

import smote as sm  # project-local SMOTE helper (module name assumed)


def load_train_data_and_split(testsize=0.3,
                              targetcol=-1,
                              file='data/processed_without_missing.csv',
                              split=True,
                              num_samples_per_class=-1,
                              num_classes=3,
                              smote=False):
    print("Loading dataset.")

    headers = []
    dataset = []
    ifile = open(file, "r")
    reader = csv.reader(ifile)
    first = True
    for row in reader:
        if first:
            first = False
            headers.append(row)
            continue
        dataset.append(row)
        for i in row:
            if not i.isdigit():
                print(row)  # flag rows that still contain non-numeric fields

    ifile.close()

    # All fields are integer-coded; parse the full dataset.
    inputs = [[int(y) for y in x] for x in dataset]

    # Split the target column off from the feature columns.
    outputs = []
    for row in inputs:
        outputs.append(row[targetcol])
        del row[targetcol]

    if num_samples_per_class > 0:  # we want equal subsets of each class.
        # first get the number of classes and their values.
        output_vals = set()
        for i, row in enumerate(inputs):
            output_vals.add(outputs[i])
        print("Number of classes: ", len(output_vals))

        # then drop samples that exceed the per-class cap.
        counts = [0, 0, 0]  # assumes exactly three target classes (1, 2, 3)
        remove_indices = []
        for i, row in enumerate(inputs):
            counts[outputs[i] - 1] += 1  # mapping from target (1,2,3) to array index (0,1,2)
            if counts[outputs[i] - 1] > num_samples_per_class:
                # we exceeded the cap, so mark this row for deletion.
                remove_indices.append(i)

        # delete from the end so the earlier indices stay valid.
        for i in reversed(range(len(remove_indices))):
            del inputs[remove_indices[i]]
            del outputs[remove_indices[i]]
        print("Final counts: ", counts)

    print("Num rows: ", len(inputs))
    print("Done loading")

    # shift the target labels from (1, 2, 3) to zero-based (0, 1, 2).
    for i in range(len(outputs)):
        outputs[i] -= 1

    if split:
        input_train, input_test, output_train, output_test = train_test_split(
            inputs, outputs, test_size=testsize, random_state=42)

        input_train = np.array(input_train)
        input_test = np.array(input_test)
        output_train = np.array(output_train)
        output_train = output_train.astype(np.int32)
        output_test = np.array(output_test)
        output_test = output_test.astype(np.int32)

        if num_classes == 2:  # merge original classes 2 and 3 (labels 1 and 2 after the shift).
            output_train[output_train == 2] = 1
            output_test[output_test == 2] = 1

        if smote:
            input_train, output_train = sm.smote_data(input_train,
                                                      output_train)

        return input_train, input_test, output_train, output_test
    else:
        inputs = np.array(inputs)
        outputs = np.array(outputs)
        return inputs, outputs
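
For instance, to load a balanced two-class split with SMOTE applied only to the training fold, the function above can be called as follows (the path and per-class cap are illustrative):

# Illustrative usage of load_train_data_and_split; values are placeholders.
X_train, X_test, y_train, y_test = load_train_data_and_split(
    testsize=0.2,
    file='data/processed_without_missing.csv',
    num_samples_per_class=3000,
    num_classes=2,
    smote=True)
print(X_train.shape, y_train.shape)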
Example #3
    DecisionTreeClassifier(max_depth=15),
    RandomForestClassifier(max_depth=15, n_estimators=10, max_features=1),
    AdaBoostClassifier(),
    GaussianNB(),
    LDA(),
    QDA()
]

X_train, X_test, y_train, y_test = common.load_train_data_and_split(
    file='data/processed_missing_filled_in.csv')

# Convert training data to NumPy arrays with integer class labels.
X_train = np.asarray(X_train)
y_train = np.array(y_train).astype(np.int32)

# Balance the training classes with SMOTE before fitting.
X_train, y_train = smote.smote_data(X_train, y_train)

# iterate over classifiers
for name, clf in zip(names, classifiers):
    print("Fitting " + name + "...")

    predicted_test = clf.fit(X_train, y_train).predict(X_test)
    # Percentage of misclassified test samples.
    test_p = (y_test != predicted_test).sum() / len(X_test) * 100
    print("Error on test set: %.2f%%" % test_p)

    print(metrics.classification_report(y_test, predicted_test))
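
To tabulate the per-classifier scores that the results block below refers to, the macro-averaged F1 can be collected into a dict instead of printing full reports (a sketch, not the author's reporting code):

# Sketch: collect macro-averaged F1 per classifier.
from sklearn.metrics import f1_score

scores = {}
for name, clf in zip(names, classifiers):
    predicted = clf.fit(X_train, y_train).predict(X_test)
    scores[name] = f1_score(y_test, predicted, average='macro')
print(scores)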
'''

results (f1-score)
-> rows with missing medical speciality removed
Example #4
    "ID3:",
    id3(X_train_r_abalone, Y_train_r_abalone, X_test_r_abalone,
        Y_test_r_abalone))
print(
    "ANN:",
    neural_net(X_train_r_abalone,
               Y_train_r_abalone,
               X_test_r_abalone,
               Y_test_r_abalone,
               0.0001,
               sigmoidal,
               10000,
               prt=False))

print("\nAbalone smoted:")
x_abalone_new, y_abalone_new = sm.smote_data(x_abalone_r, y_abalone_r, dts, 5)
X_train_abalone_new, Y_train_abalone_new, X_test_abalone_new, Y_test_abalone_new = separate(
    0.3, x_abalone_new, y_abalone_new, S)
print(
    "NBG:",
    nb_gaussiano(X_train_abalone_new, Y_train_abalone_new, X_test_abalone_new,
                 Y_test_abalone_new))
print(
    "KNN:",
    knn(X_train_abalone_new, Y_train_abalone_new, X_test_abalone_new,
        Y_test_abalone_new, 15))
print(
    "ID3:",
    id3(X_train_abalone_new, Y_train_abalone_new, X_test_abalone_new,
        Y_test_abalone_new))
print(
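
The evaluate/resample/re-evaluate pattern above can be factored into a small helper. A sketch under the snippet's own names (separate and sm.smote_data are the project helpers used above; model_fn stands for any scorer with the nb_gaussiano-style signature):

# Sketch: score one model before and after SMOTE, mirroring the flow above.
def compare_with_smote(model_fn, x, y, dts, S, test_frac=0.3, k=5):
    # Baseline on the original class distribution.
    X_tr, Y_tr, X_te, Y_te = separate(test_frac, x, y, S)
    before = model_fn(X_tr, Y_tr, X_te, Y_te)

    # Re-split after oversampling with SMOTE (k nearest neighbours).
    x_s, y_s = sm.smote_data(x, y, dts, k)
    X_tr, Y_tr, X_te, Y_te = separate(test_frac, x_s, y_s, S)
    after = model_fn(X_tr, Y_tr, X_te, Y_te)
    return before, after

Scorers that take extra hyperparameters, such as knn's neighbour count, can be adapted with a lambda, e.g. lambda *a: knn(*a, 15).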