Example #1
def developParser():
    """
    Example of developing a parser rule.
    """
    test_parser = {
        'webname': '佛山市人民政府门户网站',
        'urls': ['http://www.foshan.gov.cn/'],
        'type': 'regular',
        'pattern':
        r"<li class=[\s\S]*?href='([\s\S]*?)' title='([\s\S]*?)'[\s\S]*?([0-9]{4}-[0-9]{2}-[0-9]{2})",
        'position': {
            'title': 1,
            'href': 0,
            'time': 2
        }
    }
    html_parser = Html_Parser()
    r = Html_Downloader.download(test_parser['urls'][0])
    print(r)
    info_list, info_node = html_parser.parse(r, test_parser)
    for info in info_list:
        print(info)
    print('=============')
    for node in info_node:
        print(node)
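To make the rule format above concrete: a 'regular'-type rule pairs a regex whose groups capture the fields with a 'position' map naming each group index. Below is a minimal sketch of applying such a rule; the helper parse_regular is hypothetical, not the project's Html_Parser.parse (which, as the example shows, returns two sequences).

import re

def parse_regular(html, rule):
    # Hypothetical helper: run the configured regex over the page and
    # turn each match tuple into a dict via the 'position' index map.
    items = []
    for match in re.findall(rule['pattern'], html):
        items.append({field: match[idx]
                      for field, idx in rule['position'].items()})
    return items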
Example #2
def crawl(self, parser):
    """
    Crawl every URL configured for this parser rule.
    :param parser: rule dict with 'urls' and parsing config
    :return:
    """
    html_parser = Html_Parser()
    for url in parser['urls']:
        response = Html_Downloader.download(url)
        if response is not None:
            proxy_list = html_parser.parse(response, parser)
            if proxy_list is not None:
                # Check the proxies we just crawled.
                count, new = 0, 0
                for proxy in proxy_list:
                    count += 1
                    proxy_str = '%s:%s' % (proxy['ip'], proxy['port'])
                    if proxy_str not in self.proxies_set:
                        self.proxies_set.add(proxy_str)
                        new += 1
                        self.sqlhelper.insert(proxy)
                self.url_count += 1
                logger.info(
                    '%d/%d -- <%s> fetched %d, %d new' %
                    (self.url_count, self.url_total, url, count, new))
            else:
                self.url_count += 1
                logger.warning('%d/%d -- <%s> failed to parse data' %
                               (self.url_count, self.url_total, url))
        else:
            self.url_count += 1
            logger.warning('%d/%d -- <%s> failed to download page' %
                           (self.url_count, self.url_total, url))
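Every variant here relies on the same contract: Html_Downloader.download returns the page body on success and None on failure, which is why callers test `response is not None`. A minimal sketch of a downloader with that contract, assuming requests; the project's real implementation may add headers, encodings, retries, or proxy rotation.

import requests

class Html_Downloader(object):
    @staticmethod
    def download(url):
        # Return the page text on HTTP 200, otherwise None so that
        # callers can simply skip URLs that failed to download.
        try:
            r = requests.get(url, timeout=10)
            return r.text if r.status_code == 200 else None
        except requests.RequestException:
            return None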
Example #3
def crawl(self, parser):
    proxys = []
    html_parser = Html_Parser()
    for url in parser['urls']:
        response = Html_Downloader.download(url)
        if response is not None:
            proxylist = html_parser.parse(response, parser)
            if proxylist is not None:
                proxys.extend(proxylist)
    return proxys
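Unlike the other variants, this one returns the collected proxies instead of pushing them into shared state, so a caller can aggregate results across several rules. A hypothetical call site; `spider` and `parsers` stand in for the project's real objects.

all_proxies = []
for parser in parsers:
    # crawl() returns a list (possibly empty) for each rule
    all_proxies.extend(spider.crawl(parser))
print('collected %d proxies' % len(all_proxies))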
Example #4
def crawl(self, parser):
    html_parser = Html_Parser()
    for url in parser['urls']:
        response = Html_Downloader.download(url)
        if response is not None:
            proxylist = html_parser.parse(response, parser)
            if proxylist is not None:
                for proxy in proxylist:
                    proxy_str = '%s:%s' % (proxy['ip'], proxy['port'])
                    if proxy_str not in self.proxies:
                        self.proxies.add(proxy_str)
                        self.queue.put(proxy)
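This variant deduplicates with an in-memory set and hands each new proxy to a consumer through self.queue. A sketch of the consumer side such a producer implies, assuming a standard queue.Queue and a validation callback (both `consume` and `validate` are assumed names, not project code):

import queue

def consume(q, validate):
    # Hypothetical consumer: q.get() blocks until the crawler enqueues
    # a proxy; validate() is an assumed liveness check.
    while True:
        proxy = q.get()
        if validate(proxy):
            print('alive: %s:%s' % (proxy['ip'], proxy['port']))
        q.task_done()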
Example #5
def crawl(self, parser):
    html_parser = Html_Parser()
    for url in parser['urls']:
        response = Html_Downloader.download(url)
        if response is not None:
            proxylist = html_parser.parse(response, parser)
            if proxylist is not None:
                for proxy in proxylist:
                    proxy_str = '%s:%s' % (proxy['ip'], proxy['port'])
                    if proxy_str not in self.proxies:
                        self.proxies.add(proxy_str)
                        while True:
                            if self.queue.full():
                                time.sleep(0.1)
                            else:
                                self.queue.put(proxy)
                                break
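The poll-and-sleep loop (which also requires `import time` at module level) reimplements what queue.Queue.put already provides: with block=True, the default, put waits until a slot frees up. The loop also has a small race, since another producer can fill the queue between the full() check and the put. An equivalent, race-free sketch:

from queue import Queue

q = Queue(maxsize=100)
proxy = {'ip': '1.2.3.4', 'port': '8080'}  # example item
q.put(proxy)               # blocks while the queue is full
# q.put(proxy, timeout=5)  # or give up after a bounded wait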
Example #6
def crawl(self, parser):
    html_parser = Html_Parser()
    for url in parser['urls']:
        print('URL being crawled:', url)
        response = Html_Downloader.download(url)
        if response is not None:
            proxylist = html_parser.parse(response, parser)
            if proxylist is not None:
                for proxy in proxylist:
                    proxy_str = '%s:%s' % (proxy['ip'], proxy['port'])
                    if proxy_str not in self.proxies:
                        # self.proxies.add(proxy_str)  # seems to have no effect
                        # print('newly crawled proxy IP')
                        # print(proxy)
                        while True:
                            if self.queue.full():
                                time.sleep(0.1)
                            else:
                                # print('put the newly crawled proxy into the queue')
                                self.queue.put(proxy)
                                break