예제 #1
0
def SVM_Classifier(x_train, y_train, x_test, y_test):
    """Fit an SVC on the training split and evaluate it on the test split.

    Prints the accuracy on ``(x_test, y_test)``, then plots the ROC curve
    for the same split, saves it to ``fdir + 'image/svm_roc'`` and shows it.

    Args:
        x_train: Training feature matrix passed to ``clf.fit``.
        y_train: Training labels passed to ``clf.fit``.
        x_test: Held-out feature matrix used for accuracy and ROC.
        y_test: Held-out labels; ``roc_curve`` is called without
            ``pos_label``, so this presumably is a binary target -- confirm
            against the caller.

    Returns:
        None. Output is the printed text and the saved/shown figure.

    NOTE(review): ``tic``/``toc``, ``svm``, ``metrics``, ``plt`` and ``fdir``
    are defined elsewhere in the file; ``tic``/``toc`` presumably time the
    step they enclose.
    """
    tic()
    clf = svm.SVC(C=2, probability=True)
    clf.fit(x_train, y_train)
    toc()

    # Score on the held-out split. The previous version scored on the
    # training data while labelling the result "Test Accuracy", and never
    # used x_test / y_test at all.
    print('Test Accuracy: %.2f' % clf.score(x_test, y_test))

    # Create ROC curve from the second column of predict_proba.
    tic()
    pred_probas = clf.predict_proba(x_test)[:, 1]
    toc()

    # Debug output kept from the original: container types fed to roc_curve.
    print(type(y_test))
    print(type(pred_probas))
    fpr, tpr, _ = metrics.roc_curve(y_test, pred_probas)
    roc_auc = metrics.auc(fpr, tpr)
    plt.plot(fpr, tpr, label='area = %.2f' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal for reference
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.legend(loc='lower right')
    plt.savefig(fdir + 'image/svm_roc')
    plt.show()
    plt.close()
예제 #2
0
def pca_reducing(size, x, n_keep=200):
    """Plot the PCA explained-variance curve of ``x`` and return a projection.

    Two independent PCA fits are performed: one with ``size`` components,
    used only for the diagnostic plot saved to ``fdir + 'image/train_pca'``,
    and one with ``n_keep`` components that produces the returned data.
    ``size`` therefore does not influence the returned dimensionality.

    Args:
        size: ``n_components`` for the PCA whose ``explained_variance_`` is
            plotted.
        x: Data passed to ``PCA.fit`` / ``PCA.fit_transform``.
        n_keep: ``n_components`` of the returned projection. Defaults to
            200, the value that was previously hard-coded (the old comment
            claimed 100, which did not match the code).

    Returns:
        The result of ``PCA(n_components=n_keep).fit_transform(x)``.

    NOTE(review): ``tic``/``toc``, ``PCA``, ``plt`` and ``fdir`` come from
    elsewhere in the file; ``tic``/``toc`` presumably time the fit.
    """
    tic()
    pca = PCA(n_components=size)
    pca.fit(x)
    toc()

    # PCA drawing: explained variance per component, used to pick n_keep.
    plt.figure(1, figsize=(4, 3))
    plt.clf()
    plt.axes([.2, .2, .7, .7])
    plt.plot(pca.explained_variance_, linewidth=2)
    plt.axis('tight')
    plt.xlabel('n_components')
    plt.ylabel('explained_variance_')
    plt.savefig(fdir + 'image/train_pca')
    plt.close()

    # Project with a fresh PCA rather than reusing `pca`, exactly as the
    # original did, so results are unchanged for existing callers.
    x_pca = PCA(n_components=n_keep).fit_transform(x)

    return x_pca
예제 #3
0
# Scrape one listing page of the Douban books tag "历史" and dump the
# per-book fields to a CSV file.
url = "https://book.douban.com/tag/历史"
# Percent-encode the non-ASCII part of the URL; every printable ASCII
# character is declared safe and left untouched.
s = quote(url, safe=string.printable)
req = urllib.request.Request(s, headers=headers)
# Close the HTTP response as soon as the document has been parsed.
with urlopen(req) as html:
    bsObj = BeautifulSoup(html, 'lxml')
items = bsObj.find_all("li", class_="subject-item")


def _node_text(node):
    """Return the stripped text of *node*, or "" if the lookup found nothing.

    ``find()`` and attribute access such as ``item.p`` yield ``None`` when
    no matching tag exists; calling ``.text`` on that raised AttributeError
    and aborted the whole scrape.
    """
    return node.get_text().strip() if node is not None else ""


# Matches any <a> that carries a title attribute; compiled once, not per item.
any_title = re.compile(".*")

book_info = []
for item in items:
    titles = item.find("a", title=any_title).contents
    if len(titles) > 1:
        # Title link has a second child (presumably a subtitle <span> --
        # verify against the page markup); append its text to the main title.
        bookname = str(titles[0].strip()) + str(titles[1].text.strip())
    else:
        bookname = titles[0].strip()
    # Remaining fields may be absent on some entries (presumably e.g. books
    # without a rating or blurb); missing ones become empty strings.
    book_info.append([
        bookname,
        _node_text(item.find("div", class_="pub")),
        _node_text(item.find("span", class_="rating_nums")),
        _node_text(item.find("span", class_="pl")),
        _node_text(item.p),
    ])

column = ["书名","出版社","评分","参与人数","内容简介"]
all_books_info = pd.DataFrame(columns=column, data=book_info)
all_books_info.to_csv(r"""C:\Users\95647\Desktop\douban_books.csv""")
toc()  # closes a timer presumably started by a tic() outside this snippet