def SVM_Classifier(x_train, y_train, x_test, y_test):
    """Train an SVM on the training split and evaluate it on the test split.

    Fits ``svm.SVC(C=2, probability=True)`` on ``(x_train, y_train)``, prints
    the accuracy measured on ``(x_test, y_test)``, then draws the ROC curve of
    the test-set probabilities, saves it to ``fdir + 'image/svm_roc'`` and
    displays it.

    Args:
        x_train: Feature matrix used to fit the classifier.
        y_train: Labels used to fit the classifier.
        x_test: Held-out feature matrix used for the accuracy and ROC curve.
        y_test: Held-out labels matching ``x_test``.

    Returns:
        None. Results are printed and written to the ROC image file.
    """
    # tic()/toc() are defined elsewhere in the project; presumably they time
    # the enclosed step -- confirm against their definition.
    tic()
    clf = svm.SVC(C=2, probability=True)
    clf.fit(x_train, y_train)
    toc()

    # The original scored the training data while labelling it "Test
    # Accuracy"; the held-out split is what the message actually promises.
    # Single-argument print(...) with %-formatting parses on Python 2 and 3.
    print('Test Accuracy: %.2f' % clf.score(x_test, y_test))

    # Create ROC curve.
    tic()
    # Column 1 is taken as the positive-class probability; this assumes a
    # binary problem -- TODO confirm for the datasets passed in.
    pred_probas = clf.predict_proba(x_test)[:, 1]
    toc()

    # Debug output kept from the original: container types fed to roc_curve.
    print(type(y_test))
    print(type(pred_probas))

    fpr, tpr, _ = metrics.roc_curve(y_test, pred_probas)
    roc_auc = metrics.auc(fpr, tpr)

    plt.plot(fpr, tpr, label='area = %.2f' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal for reference
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.legend(loc='lower right')
    # Save before show(): showing first can leave an empty figure to save.
    plt.savefig(fdir + 'image/svm_roc')
    plt.show()
    plt.close()
def pca_reducing(size, x, n_reduced=200):
    """Plot the PCA explained-variance curve of ``x`` and return a reduced ``x``.

    A PCA with ``size`` components is fitted only to draw the explained
    variance curve, which is saved to ``fdir + 'image/train_pca'``. The data
    is then projected with a second, independent PCA that keeps
    ``n_reduced`` components.

    Args:
        size: Number of components fitted for the diagnostic plot.
        x: Data matrix passed to ``PCA.fit`` / ``PCA.fit_transform``.
        n_reduced: Number of components kept in the returned projection.
            Defaults to 200, the value that was previously hard-coded.

    Returns:
        The result of ``PCA(n_components=n_reduced).fit_transform(x)``.
    """
    # tic()/toc() are defined elsewhere in the project; presumably they time
    # the enclosed step -- confirm against their definition.
    tic()
    pca = PCA(n_components=size)
    pca.fit(x)
    toc()

    # PCA drawing: explained variance per component, used to pick n_reduced.
    plt.figure(1, figsize=(4, 3))
    plt.clf()
    plt.axes([.2, .2, .7, .7])
    plt.plot(pca.explained_variance_, linewidth=2)
    plt.axis('tight')
    plt.xlabel('n_components')
    plt.ylabel('explained_variance_')
    plt.savefig(fdir + 'image/train_pca')
    plt.close()

    # Keep n_reduced dimensions, chosen from the curve drawn above. A fresh
    # PCA is fitted rather than reusing `pca`, exactly as the original did,
    # so results stay identical for the default value.
    x_pca = PCA(n_components=n_reduced).fit_transform(x)
    return x_pca
# Scrape the first page of Douban's "历史" (history) book tag and dump the
# title, publication info, rating, number of raters and summary of every
# listed book to a CSV file.


def _tag_text(tag):
    """Return the stripped text of a BeautifulSoup tag, or '' if it is None."""
    return tag.get_text().strip() if tag is not None else ""


url = "https://book.douban.com/tag/历史"
# Percent-encode only the non-ASCII part of the URL; every printable ASCII
# character (scheme, slashes, ...) is declared safe and left untouched.
s = quote(url, safe=string.printable)
req = urllib.request.Request(s, headers=headers)

# Parse inside the `with` block so the HTTP response is always closed.
with urlopen(req) as html:
    bsObj = BeautifulSoup(html, 'lxml')

items = bsObj.findAll("li", class_="subject-item")
book_info = []
for item in items:
    # The title link holds the main title, optionally followed by a child
    # tag (titles[1]) whose text is appended to it.
    titles = item.find("a", title=re.compile(".*")).contents
    if len(titles) > 1:
        bookname = str(titles[0].strip()) + str(titles[1].text.strip())
    else:
        bookname = titles[0].strip()

    # find() returns None when an element is absent (e.g. no rating span or
    # no summary paragraph); fall back to '' instead of aborting the scrape
    # with an AttributeError.
    publication = _tag_text(item.find("div", class_="pub"))
    comments = _tag_text(item.find("span", class_="rating_nums"))
    num = _tag_text(item.find("span", class_="pl"))
    brief = _tag_text(item.p)

    info = [bookname, publication, comments, num, brief]
    book_info.append(info)

column = ["书名", "出版社", "评分", "参与人数", "内容简介"]
all_books_info = pd.DataFrame(columns=column, data=book_info)
all_books_info.to_csv(r"""C:\Users\95647\Desktop\douban_books.csv""")
toc()  # total time spent by this scraper