예제 #1
0
def test_learning(method='drop', train_size=270):
    """Fit a StackClassification on the leading rows and predict the rest.

    Rows come from ``get_cleaned_data_from_DB(method)``. For every row the
    'id' field is ignored, 'target' is collapsed to a binary label (any value
    above 0 becomes 1), 'thal', 'pain_type' and 'electrocardiographic' are
    one-hot encoded, and every other field is used as-is.

    Args:
        method: Forwarded unchanged to ``get_cleaned_data_from_DB``.
        train_size: Number of leading rows used for training; all remaining
            rows are held out for prediction. Defaults to 270.

    Prints the shapes of the training matrix and label vector, the
    predictions for the held-out rows (when there are any), and finally the
    held-out labels separated by spaces.
    """
    train_rows, train_labels = [], []
    held_out_rows, held_out_labels = [], []
    for index, line in enumerate(get_cleaned_data_from_DB(method)):
        in_training = index < train_size
        features = []
        for key in line:
            if key == 'id':
                continue
            value = line[key]
            if key == 'target':
                # Any positive class is folded into a single positive label.
                label = 1 if value > 0 else value
                if in_training:
                    train_labels.append(label)
                else:
                    held_out_labels.append(label)
            elif key == 'thal':
                if value == 3:
                    features += [1.0, 0.0, 0.0]
                elif value == 6:
                    features += [0.0, 1.0, 0.0]
                else:
                    features += [0.0, 0.0, 1.0]
            elif key == 'pain_type':
                # The code is shifted down by one to pick the slot
                # (presumably the codes start at 1 -- confirm against the DB).
                one_hot = [0.0, 0.0, 0.0, 0.0]
                one_hot[int(value) - 1] = 1.0
                features += one_hot
            elif key == 'electrocardiographic':
                one_hot = [0.0, 0.0, 0.0, 0.0]
                one_hot[int(value)] = 1.0
                features += one_hot
            else:
                features.append(value)
        if in_training:
            train_rows.append(features)
        else:
            held_out_rows.append(features)

    # return_norm exposes the per-column scale so that the held-out rows are
    # divided by the same factors as the training rows instead of by their
    # own column maxima, which would put them on a different scale.
    X, column_scale = normalize(np.array(train_rows), axis=0, norm='max',
                                return_norm=True)
    y = np.array(train_labels)
    clf = StackClassification()
    print(X.shape, y.shape)
    clf.fit(X, y)
    if held_out_rows:
        test = np.array(held_out_rows, dtype=float) / column_scale
        print(clf.predict(test))
    for n in held_out_labels:
        print(n, end=' ')
예제 #2
0
def learningCurve(method='drop', model_type='stack'):
    """Compute mean training/validation accuracy for growing training sets.

    Rows are read from ``dbManipulation.get_cleaned_data_from_DB(method)``.
    The 'id' field is skipped, 'target' becomes a binary label (values above
    0 map to 1), 'thal', 'pain_type' and 'electrocardiographic' are one-hot
    encoded and the remaining fields are kept unchanged. The feature matrix
    is then scaled column-wise with the max norm.

    Args:
        method: Forwarded unchanged to ``get_cleaned_data_from_DB``.
        model_type: 'stack' selects ``StackClassification``; any other value
            selects ``LogisticRegression(C=0.01, penalty='l2')``.

    Returns:
        A tuple ``(train_sizes, mean_train_scores, mean_valid_scores)`` where
        the score arrays returned by ``learning_curve`` are averaged over
        axis 1.
    """
    samples = []
    labels = []
    for record in dbManipulation.get_cleaned_data_from_DB(method):
        encoded = []
        for field in record:
            if field == 'id':
                continue
            value = record[field]
            if field == 'target':
                labels.append(1 if value > 0 else value)
            elif field == 'thal':
                if value == 3:
                    encoded.extend([1.0, 0.0, 0.0])
                elif value == 6:
                    encoded.extend([0.0, 1.0, 0.0])
                else:
                    encoded.extend([0.0, 0.0, 1.0])
            elif field == 'pain_type':
                slots = [0.0] * 4
                slots[int(value) - 1] = 1.0
                encoded.extend(slots)
            elif field == 'electrocardiographic':
                slots = [0.0] * 4
                slots[int(value)] = 1.0
                encoded.extend(slots)
            else:
                encoded.append(value)
        samples.append(encoded)

    X = normalize(np.array(samples), axis=0, norm='max')
    y = np.array(labels)
    print("Start training...")
    clf = (StackClassification() if model_type == 'stack'
           else LogisticRegression(C=0.01, penalty='l2'))
    # Absolute training-set sizes 20, 40, ..., 260.
    tick_sizes = list(range(20, 261, 20))
    train_sizes, train_scores, valid_scores = learning_curve(
        clf, X, y, train_sizes=tick_sizes, cv=10, scoring='accuracy')
    print("finish training")
    mean_train = np.mean(train_scores, axis=1)
    mean_valid = np.mean(valid_scores, axis=1)
    return train_sizes, mean_train, mean_valid
예제 #3
0
def clustering(cluster_method='kmeans'):
    """Cluster the cleaned records into two groups in a 2-D PCA-derived space.

    Rows are read from ``get_cleaned_data_from_DB(method='drop')``. The 'id'
    field is skipped, 'target' becomes a binary label (values above 0 map to
    1), 'thal', 'pain_type' and 'electrocardiographic' are one-hot encoded
    and the remaining fields are kept unchanged. The feature matrix is scaled
    column-wise with the max norm before the PCA step.

    Args:
        cluster_method: 'spectral' selects ``SpectralClustering``; any other
            value selects ``KMeans``. Both use two clusters and
            ``random_state=0``.

    Returns:
        A tuple ``(points, cluster_labels, y)``: the two-column coordinates
        that were clustered, the labels assigned by the clustering model and
        the binary target array.
    """
    def one_hot(width, position):
        # List of `width` zeros with a single 1.0 at `position`.
        slots = [0.0] * width
        slots[position] = 1.0
        return slots

    feature_rows, labels = [], []
    for row in get_cleaned_data_from_DB(method='drop'):
        vector = []
        for name in row:
            if name == 'id':
                continue
            if name == 'target':
                label = row[name]
                labels.append(1 if label > 0 else label)
            elif name == 'thal':
                code = row[name]
                vector.extend(
                    one_hot(3, 0 if code == 3 else (1 if code == 6 else 2)))
            elif name == 'pain_type':
                vector.extend(one_hot(4, int(row[name]) - 1))
            elif name == 'electrocardiographic':
                vector.extend(one_hot(4, int(row[name])))
            else:
                vector.append(row[name])
        feature_rows.append(vector)

    scaled = normalize(np.array(feature_rows), axis=0, norm='max')
    y = np.array(labels)

    # NOTE(review): PCA is fitted on the transposed matrix and the transposed
    # component vectors are used as the per-record coordinates -- confirm
    # this is intended rather than projecting with fit_transform.
    reducer = PCA(n_components=2)
    reducer.fit(scaled.T)
    points = reducer.components_.T

    if cluster_method == 'spectral':
        model = SpectralClustering(n_clusters=2,
                                   assign_labels="discretize",
                                   random_state=0)
    else:
        model = KMeans(n_clusters=2, random_state=0)
    fitted = model.fit(points)
    return points, fitted.labels_, y
예제 #4
0
def training_model(method='drop', model_type='stack'):
    """Train and return a classifier fitted on all cleaned records.

    Rows are read from ``dbManipulation.get_cleaned_data_from_DB(method)``.
    The 'id' field is skipped, 'target' becomes a binary label (values above
    0 map to 1), 'thal', 'pain_type' and 'electrocardiographic' are one-hot
    encoded and the remaining fields are kept unchanged. The feature matrix
    is scaled column-wise with the max norm before fitting.

    Args:
        method: Forwarded unchanged to ``get_cleaned_data_from_DB``.
        model_type: 'stack' selects ``StackClassification``; any other value
            selects ``LogisticRegression(C=0.01, penalty='l2')``.

    Returns:
        The fitted classifier.
    """
    records = dbManipulation.get_cleaned_data_from_DB(method)
    feature_rows = []
    classes = []
    for entry in records:
        current = []
        for column in entry:
            if column == 'id':
                continue
            raw = entry[column]
            if column == 'target':
                classes.append(1 if raw > 0 else raw)
                continue
            if column == 'thal':
                if raw == 3:
                    current += [1.0, 0.0, 0.0]
                elif raw == 6:
                    current += [0.0, 1.0, 0.0]
                else:
                    current += [0.0, 0.0, 1.0]
                continue
            if column == 'pain_type':
                flags = [0.0, 0.0, 0.0, 0.0]
                flags[int(raw) - 1] = 1.0
                current += flags
                continue
            if column == 'electrocardiographic':
                flags = [0.0, 0.0, 0.0, 0.0]
                flags[int(raw)] = 1.0
                current += flags
                continue
            current.append(raw)
        feature_rows.append(current)

    features = normalize(np.array(feature_rows), axis=0, norm='max')
    labels = np.array(classes)
    if model_type == 'stack':
        model = StackClassification()
    else:
        model = LogisticRegression(C=0.01, penalty='l2')
    model.fit(features, labels)
    return model
예제 #5
0
def _encode_feature(name, value):
    """Return the list of model inputs that a single raw feature expands to."""
    if name == 'thal':
        if value == 3:
            return [1.0, 0.0, 0.0]
        if value == 6:
            return [0.0, 1.0, 0.0]
        return [0.0, 0.0, 1.0]
    if name == 'pain_type':
        # The code is shifted down by one to pick the slot.
        one_hot = [0.0, 0.0, 0.0, 0.0]
        one_hot[int(value) - 1] = 1.0
        return one_hot
    if name == 'electrocardiographic':
        one_hot = [0.0, 0.0, 0.0, 0.0]
        one_hot[int(value)] = 1.0
        return one_hot
    return [value]


def cleanInput(inputX):
    """Impute, encode and scale one raw input record for prediction.

    The record is completed and scaled against the rows returned by
    ``get_cleaned_data_from_DB(method='drop')``:

    1. The 13 features named by ``feature_map()`` (keys 1..13) are read from
       ``inputX``; a value of '?' marks a missing entry.
    2. Each missing entry is first replaced by the column mean of the stored
       rows and then re-estimated with ``arrayKNN`` from the other features.
    3. 'thal', 'pain_type' and 'electrocardiographic' are one-hot encoded.
    4. The encoded record is appended to the encoded stored rows, the whole
       matrix is scaled column-wise with the max norm, and the scaled record
       is returned.

    Args:
        inputX: Mapping from feature name to raw value ('?' when missing).

    Returns:
        A 2-D array with a single row holding the scaled, encoded record.
    """
    # One pass over the stored rows yields both the raw matrix (used for the
    # imputation) and the encoded matrix (used for the scaling).
    raw_rows = []
    encoded_rows = []
    for record in get_cleaned_data_from_DB(method='drop'):
        raw = []
        encoded = []
        for key in record:
            if key in ('target', 'id'):
                continue
            raw.append(record[key])
            encoded.extend(_encode_feature(key, record[key]))
        raw_rows.append(raw)
        encoded_rows.append(encoded)
    cleaned = np.array(raw_rows)
    mean_vals = np.mean(cleaned, axis=0)

    key_map = feature_map()
    input_list = []
    for i in range(1, 14):
        raw_value = inputX[key_map[i]]
        if i == 10 and (raw_value == 0 or raw_value == '?'):
            # Feature 10 falls back to 1.0 when it is 0 or missing
            # (presumably a domain default -- confirm).
            input_list.append(1.0)
        elif raw_value != '?':
            input_list.append(float(raw_value))
        else:
            input_list.append(raw_value)

    # Seed every missing entry with its column mean so that the KNN queries
    # below never contain the '?' placeholder.
    missing_index = [i for i, value in enumerate(input_list) if value == '?']
    for i in missing_index:
        input_list[i] = mean_vals[i]

    # Re-estimate each missing entry from all remaining features; estimates
    # made earlier in this loop feed into the later ones.
    for i in missing_index:
        X = np.delete(cleaned, i, axis=1)
        y = cleaned[:, i]
        test = np.array(input_list[:i] + input_list[i + 1:]).reshape(1, -1)
        input_list[i] = arrayKNN(X, y, test)[0]

    cleandict = {key_map[i]: input_list[i - 1] for i in range(1, 14)}
    encoded_input = []
    for key, value in cleandict.items():
        encoded_input.extend(_encode_feature(key, value))
    InputX = np.array(encoded_input)

    # Scale the new record together with the stored rows so that it shares
    # their column-wise scale, then hand back only the new record.
    concat = np.concatenate((np.array(encoded_rows), InputX.reshape(1, -1)),
                            axis=0)
    concat = normalize(concat, axis=0, norm='max')
    return concat[-1, :].reshape(1, -1)