Example #1
def run(key):
    # Build the target URL from the host and the search key.
    url = set_url(host, key)
    cookies = Cookies()
    spider = Spider(url)
    html = spider.spider(BASEHEADERS)
    if not verify(html):
        # Verification failed: append fresh anti-crawler cookies and rotate proxies.
        BASEHEADERS["Cookie"] = BASEHEADERS["Cookie"] + cookies.cookie_str(
            ["acw_tc", "PHPSESSID"])
        proxieser.proxies()
    parser = HtmlParser(html)
    data = parser.parser("fund")

    print(data)
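
All three examples on this page funnel extraction through a project-specific HtmlParser whose parser method takes a key naming what to extract; the class itself is not shown on this page. A minimal hypothetical sketch of a compatible shape, assuming BeautifulSoup does the actual HTML work, might look like this:

from bs4 import BeautifulSoup


class HtmlParser(object):
    """Hypothetical sketch: dispatch extraction by key over stored HTML."""

    def __init__(self, html=None):
        self.html = html

    def parser(self, key):
        soup = BeautifulSoup(self.html, "html.parser")
        if key == "title":
            return soup.title.string if soup.title else None
        if key == "fund":
            # Placeholder rule; each real project defines its own selectors.
            return [td.get_text(strip=True) for td in soup.select("table td")]
        raise ValueError("unknown parser key: %s" % key)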
Example #2
from multiprocessing.managers import BaseManager


class SpiderWork(object):
    def __init__(self):
        # Initialize this distributed worker node's connection to the control node.
        # Step 1: register with BaseManager the method names used to fetch the queues.
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')
        BaseManager.register('get_page_queue')
        BaseManager.register('get_data_queue')
        server_addr = '127.0.0.1'

        print('Connecting to server %s...' % server_addr)
        self.m = BaseManager(address=(server_addr, 8001),
                             authkey='yuan'.encode('utf-8'))
        self.m.connect()
        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()
        self.page = self.m.get_page_queue()
        self.data = self.m.get_data_queue()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('*--------------------------------------------*')
        print('Initialization complete')
        print('*--------------------------------------------*')

    def crawl(self):
        count = 1
        while True:
            try:
                if not self.page.empty():
                    # Download a listing page and hand back the URLs found on it.
                    page = self.page.get()
                    urls = self.downloader.download(page)
                    self.result.put(urls)
                if not self.task.empty():
                    url = self.task.get()
                    if url == 'end':
                        print('Control node told the spider node to stop')
                        self.result.put({'new_urls': 'end', 'data': 'end'})
                        return
                    print('Spider node is parsing item %s' % count)
                    count += 1
                    data = self.parser.parser(url)
                    self.data.put(data)
            except EOFError:
                print('Connection to the worker node failed')
                return
            except Exception as e:
                print(e)
                print('Crawl failed')
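
This worker only runs against a matching control node that serves the four queues it looks up by name; that server is not part of the example. A minimal sketch of one, reusing the same address, port, and authkey as above (everything else is an assumption), could be:

import queue
from multiprocessing.managers import BaseManager

# The four shared queues that SpiderWork.__init__ fetches by name.
task_q, result_q, page_q, data_q = (queue.Queue() for _ in range(4))

BaseManager.register('get_task_queue', callable=lambda: task_q)
BaseManager.register('get_result_queue', callable=lambda: result_q)
BaseManager.register('get_page_queue', callable=lambda: page_q)
BaseManager.register('get_data_queue', callable=lambda: data_q)

manager = BaseManager(address=('127.0.0.1', 8001),
                      authkey='yuan'.encode('utf-8'))
manager.get_server().serve_forever()  # blocks; workers connect and call crawl()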
Example #3
def get_title(response):
    # Extract the page title via the shared HtmlParser helper.
    parser = HtmlParser(response)
    title = parser.parser("title")

    return title
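
Whether response here is raw HTML or a response object depends on the project's HtmlParser; with the string-based sketch from Example #1, a hypothetical call (assuming requests is installed) would be:

import requests

html = requests.get('https://example.com').text
print(get_title(html))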