Example #1
def lb_prop_classify(network, labels):
	kf = StratifiedKFold(n_splits=10)
	scores = []
	cms = []

	# StratifiedKFold.split yields (train_index, test_index); the index
	# arrays are not contiguous, so index the arrays directly instead of slicing.
	for train_index, test_index in kf.split(network, labels):
		train_dataset, train_labels = network[train_index], labels[train_index]
		test_dataset, test_labels = network[test_index], labels[test_index]

		label_propagation_model = LabelPropagation()
		label_propagation_model.fit(train_dataset, train_labels)
		scores.append(label_propagation_model.score(test_dataset, test_labels))

		prediction = label_propagation_model.predict(test_dataset)
		cms.append(confusion_matrix(test_labels, prediction,
		                            labels=label_propagation_model.classes_))

	print('label propagation mean {}'.format(np.average(scores)))
	print('label propagation standard deviation {}'.format(np.std(scores)))
	print('label propagation confusion matrix')
	print(get_percentile_cm(get_average_cm(cms)))
	print('\n')

	return scores
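
For comparison, the same 10-fold evaluation can be written with scikit-learn's cross_val_score. A minimal sketch, assuming fully labeled data (with real semi-supervised data the -1 rows must stay inside the training folds):

# Sketch only: cross-validated LabelPropagation on a fully labeled toy set.
from sklearn.datasets import load_iris
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.semi_supervised import LabelPropagation

X, y = load_iris(return_X_y=True)
cv_scores = cross_val_score(LabelPropagation(), X, y, cv=StratifiedKFold(n_splits=10))
print('mean {:.3f}, std {:.3f}'.format(cv_scores.mean(), cv_scores.std()))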
Example #2
def test_LabelPropagation_knn(*data):
    '''
    Test how prediction performance varies with n_neighbors when
    LabelPropagation uses the knn kernel.
    '''
    X, y, unlabeled_indices = data
    y_train = np.copy(y)  # a copy is required: y is still needed below
    y_train[unlabeled_indices] = -1  # mark unlabeled samples with -1

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    Ks = [1, 2, 3, 4, 5, 8, 10, 15, 20, 25, 30, 35, 40, 50]

    scores = []
    for K in Ks:
        clf = LabelPropagation(max_iter=100, n_neighbors=K, kernel='knn')
        clf.fit(X, y_train)
        scores.append(clf.score(X[unlabeled_indices], y[unlabeled_indices]))
    ax.plot(Ks, scores, label="LabelPropagation knn")

    ### figure settings
    ax.set_xlabel(r"$k$")
    ax.set_ylabel("score")
    ax.legend(loc="best")
    ax.set_title("LabelPropagation knn kernel")
    plt.show()
Example #3
def test_LabelPropagation_rbf(*data):
    x, y, unlabeled_indices = data
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    alphas = np.linspace(0.01, 1, num=10, endpoint=True)
    gammas = np.logspace(-2, 2, num=50)
    colors = ((1, 0, 0), (0, 1, 0), (0, 0, 1), (0.5, 0.5, 0), (0, 0.5, 0.5), (0.5, 0, 0.5),
              (0.4, 0.6, 0), (0.6, 0.4, 0), (0, 0.6, 0.4), (0.5, 0.3, 0.2))  # one color per curve
    # train and plot
    for alpha, color in zip(alphas, colors):
        scores = []
        for gamma in gammas:
            clf = LabelPropagation(max_iter=100, gamma=gamma, alpha=alpha, kernel='rbf')
            clf.fit(x, y_train)
            scores.append(clf.score(x[unlabeled_indices], y[unlabeled_indices]))
        ax.plot(gammas, scores, label=r"$\alpha=%s$" % alpha, color=color)
    # figure settings
    ax.set_xlabel(r"$\gamma$")
    ax.set_ylabel("score")
    ax.set_xscale("log")
    ax.legend(loc='best')
    ax.set_title("LabelPropagation rbf kernel")
    plt.show()
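
Note: snippets like this one target an older scikit-learn. In recent releases LabelPropagation no longer accepts an alpha argument (labels are hard-clamped; alpha survives only on LabelSpreading), so the calls above raise a TypeError there. A hedged sketch of the modern equivalent:

# Sketch only: on current scikit-learn, use LabelSpreading when a clamping
# factor alpha is actually wanted; LabelPropagation takes no alpha.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.semi_supervised import LabelSpreading

X, y = make_classification(n_samples=200, random_state=0)
y_train = np.copy(y)
y_train[50:] = -1  # mask most labels, as in the examples above
clf = LabelSpreading(kernel='rbf', gamma=0.1, alpha=0.2, max_iter=100)
clf.fit(X, y_train)
print(clf.score(X[50:], y[50:]))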
Example #4
def test_LabelPropagation_rbf(*data):
    '''
    Test how prediction performance varies with gamma when
    LabelPropagation uses the rbf kernel.
    '''
    X, y, unlabeled_indices = data
    # a copy is required: y is still needed below
    y_train = np.copy(y)
    # mark unlabeled samples with -1
    y_train[unlabeled_indices] = -1

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    gammas = np.logspace(-2, 2, num=50)

    scores = []
    for gamma in gammas:
        clf = LabelPropagation(max_iter=100, gamma=gamma, kernel='rbf')
        clf.fit(X, y_train)
        scores.append(clf.score(X[unlabeled_indices], y[unlabeled_indices]))
    ax.plot(gammas, scores, label="LabelPropagation rbf")

    ### figure settings
    ax.set_xlabel(r"$\gamma$")
    ax.set_ylabel("score")
    ax.set_xscale("log")
    ax.legend(loc="best")
    ax.set_title("LabelPropagation rbf kernel")
    plt.show()
Example #5
def test_LabelPropagation(*data):
    X, y, unlabeled_indices = data
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1

    clf = LabelPropagation(max_iter=100, kernel='rbf', gamma=0.1)
    clf.fit(X, y_train)
    true_labels = y[unlabeled_indices]
    print('Accuracy: %.2f' % clf.score(X[unlabeled_indices], true_labels))
Example #6
def test_LabelPropagation(*data):
    x, y, unlabeled_indices = data
    y_train = np.copy(y)  # copy here: y is still needed below
    y_train[unlabeled_indices] = -1  # mark unlabeled samples with -1
    clf = LabelPropagation(max_iter=100, kernel='rbf', gamma=0.1)
    clf.fit(x, y_train)
    # compute prediction accuracy
    true_labels = y[unlabeled_indices]  # ground-truth labels
    print("Accuracy: %f" % clf.score(x[unlabeled_indices], true_labels))
Example #7
    def process(self, n_components):
        X_train, y_train, X_test, y_test = self.preprocess(n_components)
        label_prop_model = LabelPropagation(n_jobs=-1)
        label_prop_model.fit(X_train, y_train)
        y_pred = label_prop_model.predict(X_test)
        mean_acc = label_prop_model.score(X_test, y_test)
        plot_confusion_matrix(y_test,
                              y_pred,
                              self.labels,
                              normalize=False,
                              figname=('lp_comps_%d.png' % n_components))
        self.m_acc.append(mean_acc)
        print(label_prop_model.get_params())
Example #8
def khren3(G):

    result_s = {}
    result_d = {}
    passed_set = set()

    list_neighbrs = {}

    for v in G.nodes:
        list_neighbrs.update({v: set(nx.neighbors(G, v))})

    for u in G.nodes:
        passed_set.add(u)
        for v in nx.neighbors(G, u):
            if v not in passed_set:
                cmn_nmbr = list_neighbrs[u] & list_neighbrs[v]  # common neighbours of u and v
                # dist = nx.shortest_path_length(G,u,v)
                # if dist == 2:
                # cmn_nmbr = G.distance(u,v)
                if G.nodes[u]["ground_label"] == G.nodes[v]['ground_label']:
                    result_s.update({(u, v): cmn_nmbr})
                else:
                    result_d.update({(u, v): cmn_nmbr})

    # max_s = max(len(result_s.values()))
    min_s = len(min(result_s.values(), key=len))
    min_d = len(min(result_d.values(), key=len))
    max_d = len(max(result_d.values(), key=len))

    for (pair, vertex_list) in result_d.items():
        if len(vertex_list) == max_d:
            max_pair = pair
            break

    print(min_s, min_d)

    adj_matrix = nx.adjacency_matrix(G).toarray()
    labels = [-1 for node in G.nodes]
    true_labels = [G.nodes[node]['ground_label'] for node in G.nodes]
    # labels[[0]] = 0
    labels[max_pair[0]] = 0
    labels[max_pair[1]] = 1
    # labels[0:10] = [0 for i in range(10)]
    # labels[900:910] = [1 for i in range(10)]

    lp = LabelPropagation(kernel='rbf', gamma=0.7, max_iter=1000)
    lp.fit(adj_matrix, labels)
    print(lp.score(adj_matrix, true_labels))

    return (result_s, result_d)
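
A hedged usage sketch for khren3: it assumes nodes indexed 0..n-1 that carry a "ground_label" attribute, so a two-block stochastic block model is a convenient smoke test (the sizes and probabilities below are illustrative):

# Sketch only: build a two-community graph and label each node's block.
import networkx as nx

G = nx.stochastic_block_model([50, 50], [[0.3, 0.02], [0.02, 0.3]], seed=1)
for node in G.nodes:
    G.nodes[node]["ground_label"] = 0 if node < 50 else 1
result_s, result_d = khren3(G)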
Example #9
def test_LabelPropagation_rbf(*data):
    '''
    Test how prediction performance varies with alpha and gamma when
    LabelPropagation uses the rbf kernel.

    :param data: a tuple of (sample set, sample labels, indices of the unlabeled samples)
    :return: None
    '''
    X, y, unlabeled_indices = data
    y_train = np.copy(y)  # a copy is required: y is still needed below
    y_train[unlabeled_indices] = -1  # mark unlabeled samples with -1

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    alphas = np.linspace(0.01, 1, num=10, endpoint=True)
    gammas = np.logspace(-2, 2, num=50)
    colors = (
        (1, 0, 0),
        (0, 1, 0),
        (0, 0, 1),
        (0.5, 0.5, 0),
        (0, 0.5, 0.5),
        (0.5, 0, 0.5),
        (0.4, 0.6, 0),
        (0.6, 0.4, 0),
        (0, 0.6, 0.4),
        (0.5, 0.3, 0.2),
    )  # one color per curve
    ## train and plot
    for alpha, color in zip(alphas, colors):
        scores = []
        for gamma in gammas:
            clf = LabelPropagation(max_iter=100,
                                   gamma=gamma,
                                   alpha=alpha,
                                   kernel='rbf')
            clf.fit(X, y_train)
            scores.append(clf.score(X[unlabeled_indices],
                                    y[unlabeled_indices]))
        ax.plot(gammas, scores, label=r"$\alpha=%s$" % alpha, color=color)

    ### figure settings
    ax.set_xlabel(r"$\gamma$")
    ax.set_ylabel("score")
    ax.set_xscale("log")
    ax.legend(loc="best")
    ax.set_title("LabelPropagation rbf kernel")
    plt.show()
Example #10
def test_LabelPropagation_knn(*data):
    '''
    Test how prediction performance varies with alpha and n_neighbors when
    LabelPropagation uses the knn kernel.

    :param data: a tuple of (sample set, sample labels, indices of the unlabeled samples)
    :return: None
    '''
    X, y, unlabeled_indices = data
    y_train = np.copy(y)  # a copy is required: y is still needed below
    y_train[unlabeled_indices] = -1  # mark unlabeled samples with -1

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    alphas = np.linspace(0.01, 1, num=10, endpoint=True)
    Ks = [1, 2, 3, 4, 5, 8, 10, 15, 20, 25, 30, 35, 40, 50]
    colors = (
        (1, 0, 0),
        (0, 1, 0),
        (0, 0, 1),
        (0.5, 0.5, 0),
        (0, 0.5, 0.5),
        (0.5, 0, 0.5),
        (0.4, 0.6, 0),
        (0.6, 0.4, 0),
        (0, 0.6, 0.4),
        (0.5, 0.3, 0.2),
    )  # one color per curve
    ## train and plot
    for alpha, color in zip(alphas, colors):
        scores = []
        for K in Ks:
            clf = LabelPropagation(max_iter=100,
                                   n_neighbors=K,
                                   alpha=alpha,
                                   kernel='knn')
            clf.fit(X, y_train)
            scores.append(clf.score(X[unlabeled_indices],
                                    y[unlabeled_indices]))
        ax.plot(Ks, scores, label=r"$\alpha=%s$" % alpha, color=color)

    ### figure settings
    ax.set_xlabel(r"$k$")
    ax.set_ylabel("score")
    ax.legend(loc="best")
    ax.set_title("LabelPropagation knn kernel")
    plt.show()
Example #11
def ssl_label_prop(unlabel, clfs, true, x, y, test):
    y = [int(row) for row in y]  # coerce labels to int; rebinding the loop variable would have no effect
    df_noise_x, df_noise_y, noisy_labels = shuffle.run(unlabel,
                                                       [-1] * len(unlabel), x,
                                                       y)
    ground = []
    point = []
    for row in test:
        ground.append(row[0])
        point.append(row[1:])
    # sklearn algo
    label_prop_model = LabelPropagation(kernel='knn',
                                        n_neighbors=2,
                                        max_iter=400,
                                        tol=0.01)
    label_prop_model.fit(df_noise_x, df_noise_y)
    return label_prop_model.score(point, ground)
Example #12
def test_LabelPropagation_alpha_n_neighbors(*data):
    X, y, unlabeled_indices = data
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)

    alphas = np.linspace(0.01, 1, num=2, endpoint=True)
    n_neighbors = [1, 2, 3, 4, 5, 6, 7, 8, 10, 20, 30, 40, 50]
    for i, alpha in enumerate(alphas):
        scores = []
        for n_neighbor in n_neighbors:
            clf = LabelPropagation(max_iter=1000, kernel='knn', n_neighbors=n_neighbor, alpha=alpha)
            clf.fit(X, y_train)
            true_labels = y[unlabeled_indices]
            scores.append(clf.score(X[unlabeled_indices], true_labels))

        ax.plot(n_neighbors, scores, label='alpha = %s' % alpha)
    ax.set_xlabel('n_neighbors')
    ax.set_ylabel('score')
    ax.set_xscale('log')
    ax.legend()
Example #13
def test_LabelPropagation_alpha_gamma(*data):
    X, y, unlabeled_indices = data
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)

    alphas = np.linspace(0.01, 1, num=10, endpoint=True)
    gammas = np.logspace(-2, 2, num=5)
    for i, alpha in enumerate(alphas):
        scores = []
        for gamma in gammas:
            clf = LabelPropagation(max_iter=100, kernel='rbf', gamma=gamma, alpha=alpha)
            clf.fit(X, y_train)
            true_labels = y[unlabeled_indices]
            scores.append(clf.score(X[unlabeled_indices], true_labels))

        ax.plot(gammas, scores, label='alpha = %s' % alpha)
    ax.set_xlabel('gamma')
    ax.set_ylabel('score')
    ax.set_xscale('log')
    ax.legend()
Example #14
def test_LabelPropagation_knn(*data):
    x, y, unlabeled_indices = data
    y_train = np.copy(y)
    y_train[unlabeled_indices] = -1
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    alphas = np.linspace(0.01, 1, num=10, endpoint=True)
    Ks = [1, 2, 3, 4, 5, 8, 10, 15, 20, 25, 30, 35, 40, 50]
    colors = ((1, 0, 0), (0, 1, 0), (0, 0, 1), (0.5, 0.5, 0), (0, 0.5, 0.5), (0.5, 0, 0.5),
              (0.4, 0.6, 0), (0.6, 0.4, 0), (0, 0.6, 0.4), (0.5, 0.3, 0.2))  # one color per curve
    # train and plot
    for alpha, color in zip(alphas, colors):
        scores = []
        for K in Ks:
            clf = LabelPropagation(max_iter=100, n_neighbors=K, alpha=alpha, kernel='knn')
            clf.fit(x, y_train)
            scores.append(clf.score(x[unlabeled_indices], y[unlabeled_indices]))
        ax.plot(Ks, scores, label=r"$\alpha=%s$" % alpha, color=color)
    # figure settings
    ax.set_xlabel(r"$k$")
    ax.set_ylabel("score")
    ax.legend(loc='best')
    ax.set_title("LabelPropagation knn kernel")
    plt.show()
Example #15
def experiment_2(l=8):

    # Experiment comparing random walk, tSVM, SVM and our cluster kernel:
    tSVM = LabelPropagation(max_iter=5000)

    np.random.seed(133769)  # reproducibility
    x_mac, x_win, y_mac, y_win = get_data()
    x_test = np.vstack((x_mac[-500:], x_win[-500:]))
    y_test = np.hstack((y_mac[-500:], y_win[-500:]))

    y_test_tsvm = np.hstack((0.0 * y_mac[-500:], y_win[-500:]))

    x_mac, x_win = x_mac[:-500], x_win[:-500]
    y_mac, y_win = y_mac[:-500], y_win[:-500]

    y_mac_tsvm = np.zeros((y_mac.shape))  # change -1 to zero

    x_labeled = np.vstack((x_mac[:l], x_win[:l]))
    x_unlabeled = np.vstack((x_mac[l:], x_win[l:]))

    X = np.vstack((x_labeled, x_unlabeled))
    y_labeled = np.hstack((y_mac[:l], y_win[:l]))
    y_labeled_tsvm = np.hstack((y_mac_tsvm[:l], y_win[:l]))
    y_unlabeled = np.hstack((y_mac[l:], y_win[l:]))
    y_unlabeled_tsvm = -np.ones((y_unlabeled.shape))  # Set unlabeled points
    labels_tsvm = np.hstack((y_labeled_tsvm, y_unlabeled_tsvm))

    acc_tSVM = np.array([None] * 100)
    acc_random_walk = np.array([None] * 100)
    acc_polyStep = np.array([None] * 100)
    acc_linear = np.array([None] * 100)

    kernel1 = lambda x: cluster_kernel.kernel(x, 10, "polyStep", 16)
    kernel2 = lambda x: cluster_kernel.kernel(x, 10, "linear", 16)

    for test in range(100):
        np.random.shuffle(x_mac)
        np.random.shuffle(x_win)
        x_labeled = np.vstack((x_mac[:l], x_win[:l]))
        x_unlabeled = np.vstack((x_mac[l:], x_win[l:]))

        y_labeled = np.hstack((y_mac[:l], y_win[:l]))
        X = np.vstack((x_labeled, x_unlabeled))

        tSVM.fit(X, labels_tsvm)

        acc_tSVM[test] = tSVM.score(x_test, y_test_tsvm)
        print(f'accuracy = {acc_tSVM[test] * 100}% () tSVM')

        acc_random_walk[test] = random_walk.random_walk(
            x_labeled, x_unlabeled, x_test, y_labeled, y_test)
        print(f'accuracy = {acc_random_walk[test] * 100}% () Random Walk')

        acc_polyStep[test] = evaluate_kernel(x_labeled, x_unlabeled, x_test,
                                             y_labeled, y_test, kernel1)
        print(f'accuracy = {acc_polyStep[test] * 100}% () Poly Step')

        acc_linear[test] = evaluate_kernel_SVM(x_labeled, x_unlabeled, x_test,
                                               y_labeled, y_test, kernel2)
        print(f'accuracy = {acc_linear[test] * 100}% () Linear')

        # acc[test] = evaluate_kernel_2(x_labeled_i, x_test, y_labeled, y_test, k)
        #acc[test] = evaluate_kernel(x_labeled, x_unlabeled, x_test, y_labeled, y_test, kernel)
        # acc[test] = random_walk.random_walk(x_labeled, x_unlabeled, x_test, y_labeled, y_test)

    print(
        f'normal SVM: accuracy = {acc_linear.mean() * 100}% (±{acc_linear.std() * 100:.2})'
    )
    print(
        f'tSVM: accuracy = {acc_tSVM.mean() * 100}% (±{acc_tSVM.std() * 100:.2})'
    )
    print(
        f'random walk: accuracy = {acc_random_walk.mean() * 100}% (±{acc_random_walk.std() * 100:.2})'
    )
    print(
        f'Cluster kernel: accuracy = {acc_polyStep.mean() * 100}% (±{acc_polyStep.std() * 100:.2})'
    )
Example #16
print(iris.feature_names)
print(iris.data[:5])

# In[2]:

rng = np.random.RandomState(42)
random_unlabeled_points = rng.rand(len(iris.target)) < 0.8
labels = np.copy(iris.target)
print('supervised labels: ')
print(labels)
labels[random_unlabeled_points] = -1
print('semi supervised labels: ')
print(labels)

# In[3]:

from sklearn.semi_supervised import LabelPropagation
label_prop_model = LabelPropagation()

# In[4]:

label_prop_model.fit(iris.data, labels)

# In[5]:

label_prop_model.transduction_

# In[6]:

label_prop_model.score(X=iris.data, y=iris.target)
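
Scoring against the full iris target also counts the roughly 20% of points whose labels were never masked. A sketch, continuing the cells above, that scores only the masked points and so isolates the transductive accuracy:

# Sketch only: evaluate just the points whose labels were set to -1 above.
label_prop_model.score(iris.data[random_unlabeled_points],
                       iris.target[random_unlabeled_points])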
Example #17
Y_test = list(test.Category.values)



label_prop_model = LabelPropagation(kernel="knn")

labels = np.copy(Y_train_l)

label_prop_model.fit(X_train_l, labels)

label_prop_model.predict(X_train_ul)

#y = test.Category.values.reshape(-1,1)

label_prop_model.score(X_test, test.Category.values)



label_prop_model = LabelSpreading()

labels = np.copy(Y_train_l)

label_prop_model.fit(X_train_l, labels)

label_prop_model.predict(X_train_ul)

#y = test.Category.values.reshape(-1,1)

label_prop_model.score(X_test, test.Category.values)
Example #18
def do_machine_learning_stuff(train_X, train_Y, test_X, test_Y):
    returnValue = []
    test_predict_Y = []

    # something still needs to be done with this result
    #f_classif(X, y);

    # Classification algorithms
    rfc = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
    rfc.fit(train_X, train_Y)
    test_predict_Y = rfc.predict(test_X)
    returnValue.append({
        'name':
        "RandomForestClassifier",
        'score':
        rfc.score(test_X, test_Y),
        'accuracy_naive':  # note: this is the misclassification rate, not accuracy
        (test_Y != test_predict_Y).sum() * 1.0 / len(test_predict_Y),
        'accuracy_score':
        accuracy_score(test_Y, test_predict_Y),
        'classification_report':
        classification_report(test_Y, test_predict_Y)
    })

    etc = ExtraTreesClassifier()
    etc.fit(train_X, train_Y)
    test_predict_Y = etc.predict(test_X)
    returnValue.append({
        'name':
        "ExtraTreesClassifier",
        'score':
        etc.score(test_X, test_Y),
        'accuracy_naive':
        (test_Y != test_predict_Y).sum() * 1.0 / len(test_predict_Y),
        'accuracy_score':
        accuracy_score(test_Y, test_predict_Y),
        'classification_report':
        classification_report(test_Y, test_predict_Y)
    })

    gpc = GaussianProcessClassifier(random_state=0)
    gpc.fit(train_X, train_Y)
    test_predict_Y = gpc.predict(test_X)
    # TODO: maybe also use this: print(gpc.predict_proba(test_X))
    returnValue.append({
        'name':
        "GaussianProcessClassifier",
        'score':
        gpc.score(test_X, test_Y),
        'accuracy_naive':
        (test_Y != test_predict_Y).sum() * 1.0 / len(test_predict_Y),
        'accuracy_score':
        accuracy_score(test_Y, test_predict_Y),
        'classification_report':
        classification_report(test_Y, test_predict_Y)
    })

    pac = PassiveAggressiveClassifier(max_iter=1000, random_state=0, tol=1e-3)
    pac.fit(train_X, train_Y)
    test_predict_Y = pac.predict(test_X)
    returnValue.append({
        'name':
        "PassiveAggressiveClassifier",
        'score':
        pac.score(test_X, test_Y),
        'accuracy_naive':
        (test_Y != test_predict_Y).sum() * 1.0 / len(test_predict_Y),
        'accuracy_score':
        accuracy_score(test_Y, test_predict_Y),
        'classification_report':
        classification_report(test_Y, test_predict_Y)
    })

    rc = RidgeClassifier()
    rc.fit(train_X, train_Y)
    test_predict_Y = rc.predict(test_X)
    returnValue.append({
        'name':
        "RidgeClassifier",
        'score':
        rc.score(test_X, test_Y),
        'accuracy_naive':
        (test_Y != test_predict_Y).sum() * 1.0 / len(test_predict_Y),
        'accuracy_score':
        accuracy_score(test_Y, test_predict_Y),
        'classification_report':
        classification_report(test_Y, test_predict_Y)
    })

    sgdc = SGDClassifier(max_iter=1000, tol=1e-3)
    sgdc.fit(train_X, train_Y)
    test_predict_Y = sgdc.predict(test_X)
    returnValue.append({
        'name':
        "SGDClassifier",
        'score':
        sgdc.score(test_X, test_Y),
        'accuracy_naive':
        (test_Y != test_predict_Y).sum() * 1.0 / len(test_predict_Y),
        'accuracy_score':
        accuracy_score(test_Y, test_predict_Y),
        'classification_report':
        classification_report(test_Y, test_predict_Y)
    })

    bnb = BernoulliNB()
    bnb.fit(train_X, train_Y)
    test_predict_Y = bnb.predict(test_X)
    returnValue.append({
        'name':
        "BernoulliNB",
        'score':
        bnb.score(test_X, test_Y),
        'accuracy_naive':
        (test_Y != test_predict_Y).sum() * 1.0 / len(test_predict_Y),
        'accuracy_score':
        accuracy_score(test_Y, test_predict_Y),
        'classification_report':
        classification_report(test_Y, test_predict_Y)
    })

    knnc = KNeighborsClassifier(n_neighbors=3)
    knnc.fit(train_X, train_Y)
    test_predict_Y = knnc.predict(test_X)
    returnValue.append({
        'name':
        "KNeighborsClassifier",
        'score':
        knnc.score(test_X, test_Y),
        'accuracy_naive':
        (test_Y != test_predict_Y).sum() * 1.0 / len(test_predict_Y),
        'accuracy_score':
        accuracy_score(test_Y, test_predict_Y),
        'classification_report':
        classification_report(test_Y, test_predict_Y)
    })

    mlpc = MLPClassifier()
    mlpc.fit(train_X, train_Y)
    test_predict_Y = mlpc.predict(test_X)
    returnValue.append({
        'name':
        "MLPClassifier",
        'score':
        mlpc.score(test_X, test_Y),
        'accuracy_naive':
        (test_Y != test_predict_Y).sum() * 1.0 / len(test_predict_Y),
        'accuracy_score':
        accuracy_score(test_Y, test_predict_Y),
        'classification_report':
        classification_report(test_Y, test_predict_Y)
    })

    label_prop_model = LabelPropagation()
    rng = np.random.RandomState(42)
    random_unlabeled_points = rng.rand(len(train_Y)) < 0.3
    labels = np.copy(train_Y)
    labels[random_unlabeled_points] = -1
    label_prop_model.fit(train_X, labels)
    test_predict_Y = label_prop_model.predict(test_X)
    returnValue.append({
        'name':
        "LabelPropagation",
        'score':
        label_prop_model.score(test_X, test_Y),
        'accuracy_naive':
        (test_Y != test_predict_Y).sum() * 1.0 / len(test_predict_Y),
        'accuracy_score':
        accuracy_score(test_Y, test_predict_Y),
        'classification_report':
        classification_report(test_Y, test_predict_Y)
    })

    lsvc = LinearSVC(random_state=0, tol=1e-5)
    lsvc.fit(train_X, train_Y)
    test_predict_Y = lsvc.predict(test_X)
    returnValue.append({
        'name':
        "LinearSVC",
        'score':
        lsvc.score(test_X, test_Y),
        'accuracy_naive':
        (test_Y != test_predict_Y).sum() * 1.0 / len(test_predict_Y),
        'accuracy_score':
        accuracy_score(test_Y, test_predict_Y),
        'classification_report':
        classification_report(test_Y, test_predict_Y)
    })

    svc = SVC(gamma='auto')
    svc.fit(train_X, train_Y)
    test_predict_Y = svc.predict(test_X)
    returnValue.append({
        'name':
        "SVC",
        'score':
        svc.score(test_X, test_Y),
        'accuracy_naive':
        (test_Y != test_predict_Y).sum() * 1.0 / len(test_predict_Y),
        'accuracy_score':
        accuracy_score(test_Y, test_predict_Y),
        'classification_report':
        classification_report(test_Y, test_predict_Y)
    })

    dtc = DecisionTreeClassifier(random_state=0)
    dtc.fit(train_X, train_Y)
    test_predict_Y = dtc.predict(test_X)
    returnValue.append({
        'name':
        "DecisionTreeClassifier",
        'score':
        dtc.score(test_X, test_Y),
        'accuracy_naive':
        (test_Y != test_predict_Y).sum() * 1.0 / len(test_predict_Y),
        'accuracy_score':
        accuracy_score(test_Y, test_predict_Y),
        'classification_report':
        classification_report(test_Y, test_predict_Y)
    })

    cccv = CalibratedClassifierCV()
    cccv.fit(train_X, train_Y)
    test_predict_Y = cccv.predict(test_X)
    returnValue.append({
        'name':
        "CalibratedClassifierCV",
        'score':
        cccv.score(test_X, test_Y),
        'accuracy_naive':
        (test_Y != test_predict_Y).sum() * 1.0 / len(test_predict_Y),
        'accuracy_score':
        accuracy_score(test_Y, test_predict_Y),
        'classification_report':
        classification_report(test_Y, test_predict_Y)
    })

    return returnValue
Example #19
datatrain.loc[datatrain['estilo_de_aprendizagem'] == 'Indefinido',
              'estilo_de_aprendizagem'] = 0
datatrain.loc[datatrain['estilo_de_aprendizagem'] == 'Ativo',
              'estilo_de_aprendizagem'] = 1
datatrain.loc[datatrain['estilo_de_aprendizagem'] == 'Teorico',
              'estilo_de_aprendizagem'] = 2
datatrain.loc[datatrain['estilo_de_aprendizagem'] == 'Reflexivo',
              'estilo_de_aprendizagem'] = 3
datatrain.loc[datatrain['estilo_de_aprendizagem'] == 'Pragmatico',
              'estilo_de_aprendizagem'] = 4

datatrain = datatrain.apply(pd.to_numeric)
datatrain_array = datatrain.to_numpy()  # as_matrix() no longer exists in pandas

X = datatrain_array[:, :14]
y = datatrain_array[:, 14:15]
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=1)

scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

cv = LabelPropagation()
cv.fit(X_train, y_train.ravel())  # ravel: sklearn expects a 1-D label array
precisao = cv.score(X_test, y_test.ravel())
print("------Accuracy-------: %f" % precisao)
Example #20
sgd.score(x_test_3, y_test_3)
sgd = SGDClassifier(loss='log', shuffle=True, random_state=171)
sgd.fit(x_train_3, y_train_3)
sgd.predict(x_train_3)
sgd.score(x_test_3, y_test_3)
sgd = SGDClassifier(shuffle=True, random_state=171)
sgd.fit(x_train_3, y_train_3)
sgd.predict(x_train_3)
sgd.score(x_test_3, y_test_3)
submission = pd.DataFrame({'Id': test.Id, 'Cover_Type': ensemble_test_pred})
submission.head()
submission.to_csv('submission.csv', index=False)
submission_tree = pd.DataFrame({'Id': test.Id, 'Cover_Type': tree_test_pred})
submission_tree.head()
submission_tree.to_csv('submission2.csv', index=False)
#Extra tree classifier is a tree based model for classification problems
et = ExtraTreeClassifier()
et.fit(x_train_3, y_train_3)
et.predict(x_train_3)
et.score(x_test_3, y_test_3)
from sklearn.semi_supervised import LabelPropagation
lb = LabelPropagation()
lb.fit(x_train_3, y_train_3)
lb.predict(x_train_3)
lb.score(x_test_3, y_test_3)
from sklearn.neighbors import KNeighborsClassifier
knng = KNeighborsClassifier()
knng.fit(x_train_3, y_train_3)
knng.predict(x_train_3)
knng.score(x_test_3, y_test_3)
Example #21
    call_times.append(str(i) + '点通话次数')

df = df[['次均通话时长', '在网时长(单位:秒)', '当月活跃基站个数', '交往圈数量', 'avg']]
#df= df.drop(call_times,axis = 1 )
#df = df[['月累计短信发送数量','月累计流量使用情况(单位:字节)']]

print('start pca...')
pca = PCA(n_components=2)
reduced_X = pca.fit_transform(df)
reduced_X_1 = reduced_X[:, 0]

reduced_X_2 = reduced_X[:, 1]

component = pd.DataFrame({'p1': reduced_X_1, 'p2': reduced_X_2, 'label': y})

X = component[['p1', 'p2']]
y = component['label']
sss = StratifiedShuffleSplit(n_splits=3,
                             train_size=0.0025,
                             test_size=0.0025,
                             random_state=0)
sss.get_n_splits(X, y)
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
for train_index, test_index in sss.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    label_prop_model = LabelPropagation(max_iter=5000)
    label_prop_model.fit(X_train, y_train)
    print(label_prop_model.score(X_test, y_test))
Example #22
def label_propagation(x_train, y_train, x_test, y_test):
    from sklearn.semi_supervised import LabelPropagation
    sel = LabelPropagation()
    sel.fit(x_train, y_train)
    value = sel.score(x_test, y_test)
    return "{0:.2f}".format(value)
Example #23
    def label_propagation(self,
                          kernel='rbf',
                          gamma=20,
                          n_neighbors=7,
                          max_iter=30,
                          tol=1e-3,
                          n_jobs=1):
        """
            Label Propagation classifier for semi-supervised learning

            Parameters
            ----------
            kernel : {'knn', 'rbf'}
                String identifier for kernel function to use or the kernel function
                itself. Only 'rbf' and 'knn' strings are valid inputs. The function
                passed should take two inputs, each of shape [n_samples, n_features],
                and return a [n_samples, n_samples] shaped weight matrix.

            gamma : float
                Parameter for rbf kernel

            n_neighbors : integer > 0
                Parameter for knn kernel

            max_iter : integer
                Change maximum number of iterations allowed

            tol : float
                Convergence tolerance: threshold to consider the system at steady
                state

            n_jobs : int or None, optional (default=None)
                The number of parallel jobs to run.
                ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
                ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
                for more details.

            Returns
            -------
            score : float
                The score of the learned model on the test data.

            Example
            --------
            >>> labeled_path = "../data/labeled.csv"
            >>> unlabeled_path = "../data/unlabeled.csv"
            >>> mtl = MultiTaskLearner(labeled_path, unlabeled_path)
            >>> encoding = mtl.embed(word_length=5)
            >>> X, y, X_t, y_t = train_test_split(mtl.sequences, mtl.labels, test_size=0.33)
            >>> score = mtl.semi_supervised_learner(X, y, X_t, y_t, ssl="label_propagation")
        """
        model = LabelPropagation(kernel=kernel,
                                 gamma=gamma,
                                 n_neighbors=n_neighbors,
                                 max_iter=max_iter,
                                 tol=tol,
                                 n_jobs=n_jobs)
        model.fit(self.X, self.y)
        return model.score(self.X_t, self.y_t)
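
For orientation, the call this wrapper forwards is plain LabelPropagation. A standalone sketch with toy data standing in for the class's self.X / self.y / self.X_t / self.y_t attributes:

# Sketch only: the parameters mirror the method's defaults above.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.semi_supervised import LabelPropagation

X_all, y_all = make_classification(n_samples=300, random_state=0)
X, X_t, y, y_t = train_test_split(X_all, y_all, test_size=0.33, random_state=0)
model = LabelPropagation(kernel='rbf', gamma=20, n_neighbors=7,
                         max_iter=30, tol=1e-3, n_jobs=1)
model.fit(X, y)
print(model.score(X_t, y_t))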
Example #24
        words = f.read().split("\n")
        while i < len(words):
            j = 0
            while j < len(newsgroups_train.data):
                newsgroups_train.data[j] = re.sub(words[i], '', newsgroups_train.data[j])
                j += 1
            i += 1
    f.close()
    print([newsgroups_train.data[0]])



# feature extraction
vectorizer = TfidfVectorizer(stop_words=get_stopwords())
vectors = vectorizer.fit_transform(newsgroups_train.data)


clf = LabelPropagation(kernel='rbf', gamma=0.89).fit(vectors.toarray(), newsgroups_train.target)
test_vec = vectorizer.transform(newsgroups_test.data)
pred = clf.predict(test_vec)
print(clf.score(test_vec, newsgroups_test.target))
print('f1 score: ', metrics.f1_score(newsgroups_test.target, pred, average='macro'))

remove_regex_words()
vectors = vectorizer.fit_transform(newsgroups_train.data)
clf = LabelPropagation(kernel='rbf', gamma=0.89).fit(vectors.toarray(), newsgroups_train.target)
test_vec = vectorizer.transform(newsgroups_test.data)
pred = clf.predict(test_vec)
print(clf.score(test_vec, newsgroups_test.target))
print('f1 score: ', metrics.f1_score(newsgroups_test.target, pred, average='macro'))
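
newsgroups_train.target contains no -1 entries, so the fits above are effectively supervised. A hedged sketch of the genuinely semi-supervised variant masks part of the targets first:

# Sketch only: hide 70% of the training labels before fitting.
import numpy as np

rng = np.random.RandomState(0)
masked_target = np.copy(newsgroups_train.target)
masked_target[rng.rand(len(masked_target)) < 0.7] = -1
clf = LabelPropagation(kernel='rbf', gamma=0.89).fit(vectors.toarray(), masked_target)
print(clf.score(test_vec, newsgroups_test.target))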
Example #25
def run_methods(x_c, y, x_e, z_c, z_y, z_e):
    x = np.concatenate((x_c, x_e), axis=1)
    z = np.concatenate((z_c, z_e), axis=1)

    # Baseline: Linear Logistic Regression
    lin_lr = LogisticRegression(random_state=0,
                                solver='liblinear').fit(x, y.ravel())
    acc_lin_lr = lin_lr.score(z, z_y)
    # hard_label_lin_lr = lin_lr.predict(z)
    # soft_label_lin_lr = lin_lr.predict_proba(z)[:, 1]

    # TRANSDUCTIVE APPROACHES
    # merge labelled and unlabelled data (with label -1) for transductive methods
    x_merged = np.concatenate((x, z))
    y_merged = np.concatenate((y, -1 * np.ones(
        (z.shape[0], 1)))).ravel().astype(int)

    # Baseline: Linear TSVM: https://github.com/tmadl/semisup-learn/tree/master/methods
    lin_tsvm = SKTSVM(kernel='linear')
    lin_tsvm.fit(x_merged, y_merged)
    acc_lin_tsvm = lin_tsvm.score(z, z_y)
    # hard_label_lin_tsvm = lin_tsvm.predict(z)
    # soft_label_lin_tsvm = lin_tsvm.predict_proba(z)[:, 1]

    # Baseline: Non-Linear TSVM:  https://github.com/tmadl/semisup-learn/tree/master/methods
    rbf_tsvm = SKTSVM(kernel='RBF')
    rbf_tsvm.fit(x_merged, y_merged)
    acc_rbf_tsvm = rbf_tsvm.score(z, z_y)
    # hard_label_rbf_tsvm = rbf_tsvm.predict(z)
    # soft_label_rbf_tsvm = rbf_tsvm.predict_proba(z)[:, 1]

    # Baseline: Label Propagation RBF weights
    try:
        rbf_label_prop = LabelPropagation(kernel='rbf')
        rbf_label_prop.fit(x_merged, y_merged)
        acc_rbf_label_prop = rbf_label_prop.score(z, z_y)
        # hard_label_rbf_label_prop= rbf_label_prop.predict(z)
        # soft_label_rbf_label_prop = rbf_label_prop.predict_proba(z)[:, 1]
    except Exception:
        acc_rbf_label_prop = []
        print('rbf label prop did not work')

    # Baseline: Label Spreading with RBF weights
    try:
        rbf_label_spread = LabelSpreading(kernel='rbf')
        rbf_label_spread.fit(x_merged, y_merged)
        acc_rbf_label_spread = rbf_label_spread.score(z, z_y)
        # hard_label_rbf_label_spread = rbf_label_spread.predict(z)
        # soft_label_rbf_label_spread = rbf_label_spread.predict_proba(z)[:, 1]
    except Exception:
        acc_rbf_label_spread = []
        print('rbf label spread did not work')

    # THE K-NN VERSIONS ARE UNSTABLE UNLESS USING LARGE K
    # Baseline: Label Propagation with k-NN weights
    try:
        knn_label_prop = LabelPropagation(kernel='knn', n_neighbors=11)
        knn_label_prop.fit(x_merged, y_merged)
        acc_knn_label_prop = knn_label_prop.score(z, z_y)
        # hard_label_knn_label_prop = knn_label_prop.predict(z)
        # soft_label_knn_label_prop = knn_label_prop.predict_proba(z)[:, 1]
    except Exception:
        acc_knn_label_prop = []
        print('knn label prop did not work')

    # Baseline: Label Spreading with k-NN weights
    try:
        knn_label_spread = LabelSpreading(kernel='knn', n_neighbors=11)
        knn_label_spread.fit(x_merged, y_merged)
        acc_knn_label_spread = knn_label_spread.score(z, z_y)
        # hard_label_knn_label_spread = knn_label_spread.predict(z)
        # soft_label_knn_label_spread = knn_label_spread.predict_proba(z)[:, 1]
    except Exception:
        acc_knn_label_spread = []
        print('knn label spread did not work')

    # Generative Models
    # Semi-generative model on labelled data only
    a_y, b_y, a_e0, a_e1, b_0, b_1, cov_e0, cov_e1 = soft_label_EM(
        x_c, y, x_e, z_c, z_e, converged=True)
    soft_label_semigen = predict_class_probs(z_c, z_e, a_y, b_y, a_e0, a_e1,
                                             b_0, b_1, cov_e0, cov_e1)
    hard_label_semigen = soft_label_semigen > 0.5
    acc_semigen_labelled = np.mean(hard_label_semigen == z_y)

    # EM with soft labels
    a_y, b_y, a_e0, a_e1, b_0, b_1, cov_e0, cov_e1 = soft_label_EM(
        x_c, y, x_e, z_c, z_e)
    soft_label_soft_EM = predict_class_probs(z_c, z_e, a_y, b_y, a_e0, a_e1,
                                             b_0, b_1, cov_e0, cov_e1)
    hard_label_soft_EM = soft_label_soft_EM > 0.5
    acc_soft_EM = np.mean(hard_label_soft_EM == z_y)

    # EM with hard labels
    a_y, b_y, a_e0, a_e1, b_0, b_1, cov_e0, cov_e1 = hard_label_EM(
        x_c, y, x_e, z_c, z_e)
    soft_label_hard_EM = predict_class_probs(z_c, z_e, a_y, b_y, a_e0, a_e1,
                                             b_0, b_1, cov_e0, cov_e1)
    hard_label_hard_EM = soft_label_hard_EM > 0.5
    acc_hard_EM = np.mean(hard_label_hard_EM == z_y)

    # Conditional label prop
    acc_cond_prop = conditional_prop(x_c, y, x_e, z_c, z_y, z_e)

    return acc_lin_lr, acc_lin_tsvm, acc_rbf_tsvm, acc_rbf_label_prop, acc_rbf_label_spread, acc_knn_label_prop,\
           acc_knn_label_spread, acc_semigen_labelled, acc_soft_EM, acc_hard_EM, acc_cond_prop