Example #1
    def test_importcheck_thread_safety(self, datapath):
        # see gh-16928

        class ErrorThread(threading.Thread):
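            # Capture any exception raised in run() on self.err so the main
            # thread can check for failures after the worker has finished.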
            def run(self):
                try:
                    super().run()
                except Exception as e:
                    self.err = e
                else:
                    self.err = None

        # force the import check to run again by reinitialising the global vars in html.py
        reload(pandas.io.html)

        filename = datapath('io', 'data', 'valid_markup.html')
        helper_thread1 = ErrorThread(target=self.read_html, args=(filename,))
        helper_thread2 = ErrorThread(target=self.read_html, args=(filename,))

        helper_thread1.start()
        helper_thread2.start()

        helper_thread1.join()
        helper_thread2.join()
        assert None is helper_thread1.err is helper_thread2.err
import sys
import time
import urllib.parse
import urllib.request

import numpy as np
from bs4 import BeautifulSoup
from openpyxl import Workbook

# Some User-Agent headers to rotate between requests
hds = [
    {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'},
    {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'},
]


def book_spider(book_tag):
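    """Scrape book listings for book_tag from Douban's tag pages."""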
    page_num = 0
    book_list = []
    try_times = 0

    while True:
        url = 'http://www.douban.com/tag/' + urllib.parse.quote(
            book_tag) + '/book?start=' + str(page_num * 15)
        # time.sleep(np.random.rand() * 5)  # optional: throttle requests

        # Fetch the current results page, rotating the User-Agent per request
        try:
            req = urllib.request.Request(url, headers=hds[page_num % len(hds)])
            source_code = urllib.request.urlopen(req).read()