Example #1
    def __init__(self,
                 worker_num=10,
                 chunk_size=10000,
                 log_interval=600,
                 data_dir='data',
                 log_dir='log'):
        self.chunk_size = chunk_size
        self.log_interval = log_interval
        self.urls = Queue()
        self.results = Queue()
        self.url_cache = Set()
        self.name_cache = Set()
        self.black_urls = Set()
        self.black_cache = Dict()
        self.chunk_num = 0
        self.parser = HtmlParser(home='https://baike.baidu.com')

        self.last = 0
        self.state = 1

        if not os.path.exists(data_dir):
            os.mkdir(data_dir)
        if not os.path.exists(log_dir):
            os.mkdir(log_dir)
        self.data_dir = data_dir
        self.log_dir = log_dir

        self.writer = Thread(target=self._write)
        self.logger = Timer(log_interval, self._log)
        self.spiders = [Thread(target=self._scrap) for _ in range(worker_num)]
Example #2
    def __init__(self, provider=None, charset=None):
        '''
            target: (url, charset, provider)
                url - the page URL
                provider - the parse provider (a JS file) used to parse the page
                charset - the page encoding
        '''
        self._targets = []
        self._proceeding_cursor = 0
        self.parser = HtmlParser()
        self._default_request_headers = {}  # default HTTP request headers
        self._default_provider = provider
        self._default_charset = charset
Example #3
    def gl(self, html):
        hrefs = set()
        parser = HtmlParser(html)
        for href in parser.hrefs:
            u_parse = urlparse(href)
            # keep relative links and links pointing at the seed domain
            if u_parse.netloc == '' or u_parse.netloc == self.domain:
                hrefs.add(href)
        return hrefs

    def find_signature(self, html_content):
        # Return True if the signature is found in any script block, else False.
        # The HTML has to be parsed first, since scripts only run when the
        # markup is syntactically valid.
        parser = HtmlParser(html_content)
        return any(self.SIGNATURE in s for s in parser.script_text)
Example #5
    def parseHtml(self, base_url, html):
        try:
            parser = HtmlParser()
            parsing_result = parser.parse(html, False)
            [title_text, full_text, core_text, rest_text, all_links, etc_datas] = parsing_result

            tree = parser.tree
            title = ""
            body = ""
            write_time = 0
            imgs = None
            res = dict()

            s_id = tree.root.id
            view_node = tree.root
            e_id = view_node.findENode()
            (
                extractedContent,
                rest_text,
                links_in_summary,
                imgs,
                core_len,
                text_list,
            ) = tree.root.getTextImageWithPosition(s_id, e_id, "IN")
            body_pieces = extractedContent.split()
            body_text = " ".join(body_pieces)
            body = body_text.replace("|11818|", "&")

            res["all_links"] = all_links
            res["links"] = links_in_summary
            res["image_count"] = len(imgs)

            res["images"] = list()
            for i_link in imgs:
                r_image = parser.makePerfectURL(i_link)
                res["images"].append(r_image)
            res["embed_links"] = parser.embed_links
            res["meta_data"] = parser.meta_dict
            res["body_extension"] = rest_text.strip()
            return body, res
        except Exception as msg:
            # swallow parsing errors; the caller receives None on failure
            pass
Example #6
class SpiderWorker(object):
    def __init__(self, url, size=20):
        self.url = url
        self.pool = ProxiesPool()
        self.parser = HtmlParser(url)
        self.url_manager = URLSManager(url_pattern=url, size=size)
        self.writer = FileWriter()

    @Decorator.time
    def start(self):
        self.url_manager.add_url(self.url)
        while self.url_manager.has_next():
            hd = HtmlDownloader(proxies=self.pool.get_proxy_ip())
            url = self.url_manager.get_url()
            data = hd.download(url)
            urls = self.parser.simple_tags(data, 'a', attributes=['href'])
            self.url_manager.add_urls([url_.get('href') for url_ in urls])
            title = self.parser.element(data, 'title')
            title = title.getText() if title else 'unknown'
            self.writer.load_data('[%s] %s' % (title, url))
        self.writer.writer()
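
A minimal usage sketch for the SpiderWorker above, assuming the class and its helpers (ProxiesPool, HtmlDownloader, URLSManager, FileWriter, Decorator) are importable from the surrounding project; the seed URL is hypothetical.

# crawl URLs matching the seed pattern (up to the URLSManager's size limit),
# collect one "[title] url" line per page, then flush them via FileWriter
worker = SpiderWorker('https://example.com', size=50)
worker.start()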
Example #7
	def __init__(self, provider = None, charset = None):
		'''
			target: (url, charset, provider)
				url - the page URL
				provider - the parse provider (a JS file) used to parse the page
				charset - the page encoding
		'''
		self._targets = []
		self._proceeding_cursor = 0
		self.parser = HtmlParser()
		self._default_request_headers = {} # default HTTP request headers
		self._default_provider = provider
		self._default_charset = charset
Example #8
    def __update_proxy_pool(self):
        downloader = HtmlDownloader()
        proxy_pool = ProxiesPool()
        parser = HtmlParser()
        data = downloader.download(self.proxy_site)
        speed_times = parser.multilevel_tags(data, [{'tr': None}, {'div': {'class': 'bar'}}])
        ip_data = parser.elements(data, 'tr')[1:]
        speed = speed_times[::2]
        times = speed_times[1::2]
        for i, ip in enumerate(ip_data):
            d = {}
            for j, value in enumerate(filter(lambda x: x, ip_data[i].text.split('\n'))):
                if j == 0:
                    d['ip'] = value
                elif j == 1:
                    d['port'] = value
                continue
            if len(d.keys()) != 2:
                continue
            if self.__re_number(speed[i].get('title')) > 1 \
                    or self.__re_number(times[i].get('title')) > 1:
                continue

            proxy_pool.add({'http': '%s:%s' % (d.get('ip'), d.get('port'))})
    def get_links(self, html):
        """
        Parse return link in html contents
        by finding href attribute in a tag.
        """

        hrefs = set()
        parser = HtmlParser(html)

        # iterate over the hrefs extracted by the parser
        for href in parser.hrefs:
            u_parse = urlparse(href)

            # keep hrefs that are relative or point at the seed URL's domain
            if u_parse.netloc == '' or u_parse.netloc == self.domain:
                hrefs.add(href)
        return hrefs
Example #10
class Spider():
    _proceed_num = 10  # maximum number of pages to crawl
    _proceed_wait = 2  # delay between page fetches
    '''
        The (parse) provider is a JS script that parses the page DOM and produces data;
        the data is then passed via __callback__ to the Spider's onparsed event.
    '''
    _default_provider = None  # default provider
    _default_charset = None

    logger = logger().instance()

    def __init__(self, provider=None, charset=None):
        '''
            target: (url, charset, provider)
                url - the page URL
                provider - the parse provider (a JS file) used to parse the page
                charset - the page encoding
        '''
        self._targets = []
        self._proceeding_cursor = 0
        self.parser = HtmlParser()
        self._default_request_headers = {}  # default HTTP request headers
        self._default_provider = provider
        self._default_charset = charset

    @staticmethod
    def setupCookies():
        '''
            Enable cookie support, used when crawling pages that require login or other authentication.
        '''
        cookie_jar = cookielib.CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))
        urllib2.install_opener(opener)

    def setRequestHeader(self, key, value):
        self._default_request_headers[key] = value

    def setUserAgent(self, ua=Webkit().userAgent):
        '''
            Send a User-Agent header with each request.
        '''
        self.setRequestHeader('User-Agent', ua)

    def append(self, url, charset=None, provider=None, headers={}, body={}):
        '''
            Add a new crawl target.
        '''
        request_headers = {}
        request_headers.update(self._default_request_headers)
        request_headers.update(headers)

        #self.logger.info(request_headers)

        self._targets.append(
            (url, charset or self._default_charset, provider
             or self._default_provider, request_headers, body))

    def proceed(self, proceed_num=None, proceed_wait=None):

        proceed_num = proceed_num or self._proceed_num
        proceed_wait = proceed_wait or self._proceed_wait

        while (self._proceeding_cursor < len(self._targets)
               and self._proceeding_cursor < proceed_num):
            try:
                target = self._targets[self._proceeding_cursor]
                self.logger.info('new proceeding... target: ' + target[0])
                (url, charset, provider, headers, body) = target
                script = "require('" + provider + "');"
                self.parser.parse(url,
                                  charset,
                                  headers,
                                  body,
                                  script,
                                  callback=self.onparsed)
            except Exception as ex:
                self.logger.error(ex)
                self.logger.debug(traceback.format_exc())
            finally:
Example #11
    def _read_in_format(self, file):
        return HtmlParser(file)
Example #12
    def __init__(self, url, size=20):
        self.url = url
        self.pool = ProxiesPool()
        self.parser = HtmlParser(url)
        self.url_manager = URLSManager(url_pattern=url, size=size)
        self.writer = FileWriter()
Example #13
class Spider(object):
    def __init__(self,
                 worker_num=10,
                 chunk_size=10000,
                 log_interval=600,
                 data_dir='data',
                 log_dir='log'):
        self.chunk_size = chunk_size
        self.log_interval = log_interval
        self.urls = Queue()
        self.results = Queue()
        self.url_cache = Set()
        self.name_cache = Set()
        self.black_urls = Set()
        self.black_cache = Dict()
        self.chunk_num = 0
        self.parser = HtmlParser(home='https://baike.baidu.com')

        self.last = 0
        self.state = 1

        if not os.path.exists(data_dir):
            os.mkdir(data_dir)
        if not os.path.exists(log_dir):
            os.mkdir(log_dir)
        self.data_dir = data_dir
        self.log_dir = log_dir

        self.writer = Thread(target=self._write)
        self.logger = Timer(log_interval, self._log)
        self.spiders = [Thread(target=self._scrap) for _ in range(worker_num)]

    def start(self, url):
        new_urls, new_data = self.parser.parse(url)
        self.results.put(new_data)
        self.url_cache.add(url)
        self.name_cache.add(new_data['name'])
        for url in new_urls:
            self.urls.put(url)

        self.logger.start()
        self.writer.start()
        for spider in self.spiders:
            spider.start()

    def _write(self):
        """只使用self.results
        """
        while self.state:
            self.chunk_num += 1
            n = 0
            with open(
                    os.path.join(self.data_dir,
                                 '{}.json'.format(self.chunk_num)),
                    'wb') as fp:
                while n < self.chunk_size:
                    if not self.results.empty():
                        result = self.results.get()
                        line = json.dumps(result, ensure_ascii=False) + '\n'
                        fp.write(line.encode('utf8'))
                        n += 1
                    else:
                        sleep(10)

    def _log(self):
        now = len(self.name_cache)
        increase = now - self.last
        self.last = now
        if increase == 0:
            self.state = 0
            print('Exit: no entities scraped in this round.')
            exit()
        else:
            with open(os.path.join(self.log_dir, 'log'), 'ab+') as fp:
                message = 'new entries: {}, total entries scraped: {}; URLs collected: {}, pending tasks: {}, pending results: {}.'.format(
                    increase,
                    now,
                    len(self.url_cache),
                    self.urls.qsize(),
                    self.results.qsize(),
                ) + '\n'
                fp.write(message.encode('utf8'))
        timer = Timer(self.log_interval, self._log)
        timer.start()

    def _scrap(self):
        while self.state:
            if not self.urls.empty():
                url = self.urls.get()
                try:
                    new_urls, new_data = self.parser.parse(url)
                except Exception:
                    self.url_cache.remove(url)
                    # blacklist URLs that fail repeatedly
                    if url not in self.black_cache:
                        self.black_cache[url] = 1
                    self.black_cache[url] += 1
                    if self.black_cache[url] >= 3:
                        self.black_urls.add(url)
                    continue
                name = new_data['name']
                if name not in self.name_cache:
                    self.name_cache.add(name)
                    if new_data['infomation']:  # skip entries with no attribute information
                        self.results.put(new_data)
                for url in new_urls:
                    if url not in self.url_cache and url not in self.black_urls:
                        self.url_cache.add(url)
                        self.urls.put(url)
            else:
                sleep(10)
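
A minimal usage sketch for the Spider above, assuming the class and the Queue/Set/Dict/HtmlParser names it relies on are importable from the surrounding project; the seed entry URL is hypothetical.

# the seed entry is parsed synchronously, then the logger timer, the writer
# thread, and the worker threads take over; results are written to
# data/<chunk_num>.json in chunks of `chunk_size` entries
spider = Spider(worker_num=4, chunk_size=1000, log_interval=300)
spider.start('https://baike.baidu.com/item/Python')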
Example #14
# -*- coding: utf-8 -*-
import sys, requests, json, os, io
from parser import HtmlParser
from datahandler import DataHandler
if __name__ == '__main__':
    parse = HtmlParser()

    datahandler = DataHandler()
    datahandler.SaveFiles()
    # determine the range of pages to retrieve

    downList = []
    with open('./data/error_page.txt') as data_file:
        downList = json.load(data_file)

    parser = HtmlParser()
    parser.pages = downList
    parser.bar = ProgressBar(total=len(downList))  # ProgressBar is assumed to come from the project; it is not imported above
    parser.parse()
Example #15
class Spider():
	_proceed_num = 10 # maximum number of pages to crawl
	_proceed_wait = 2 # delay between page fetches

	'''
		The (parse) provider is a JS script that parses the page DOM and produces data;
		the data is then passed via __callback__ to the Spider's onparsed event.
	'''
	_default_provider = None # default provider
	_default_charset = None

	logger = logger().instance()

	def __init__(self, provider = None, charset = None):
		'''
			target: (url, charset, provider)
				url - the page URL
				provider - the parse provider (a JS file) used to parse the page
				charset - the page encoding
		'''
		self._targets = []
		self._proceeding_cursor = 0
		self.parser = HtmlParser()
		self._default_request_headers = {} # default HTTP request headers
		self._default_provider = provider
		self._default_charset = charset

	@staticmethod
	def setupCookies():
		'''
			Enable cookie support, used when crawling pages that require login or other authentication.
		'''
		cookie_jar = cookielib.CookieJar()
		opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))  
		urllib2.install_opener(opener)  

	def setRequestHeader(self, key, value):
		self._default_request_headers[key] = value

	def setUserAgent(self, ua = Webkit().userAgent):
		'''
			Send a User-Agent header with each request.
		'''
		self.setRequestHeader('User-Agent', ua)

	def append(self, url, charset = None , provider = None, headers = {}, body = {}):
		'''
			Add a new crawl target.
		'''
		request_headers = {}
		request_headers.update(self._default_request_headers)
		request_headers.update(headers)

		#self.logger.info(request_headers)

		self._targets.append((url, charset or self._default_charset, provider or self._default_provider, request_headers, body))		

	def proceed(self, proceed_num = None, proceed_wait = None):
		
		proceed_num = proceed_num or self._proceed_num
		proceed_wait = proceed_wait or self._proceed_wait

		while(self._proceeding_cursor < len(self._targets) and self._proceeding_cursor < proceed_num):
			try:
				target = self._targets[self._proceeding_cursor]
				self.logger.info('new proceeding... target: ' + target[0])
				(url, charset, provider, headers, body) = target
				script = "require('" + provider + "');"
				self.parser.parse(url, charset, headers, body, script, callback=self.onparsed)
			except Exception as ex:
				self.logger.error(ex)
				self.logger.debug(traceback.format_exc())
			finally:
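
The proceed() method above is cut off at its final finally: block, but the calling pattern is still visible from __init__, append(), and proceed(). A minimal usage sketch, assuming the Spider class and its HtmlParser/Webkit helpers are importable; the provider script path, the target URL, and the on_parsed handler are hypothetical, and assigning spider.onparsed is an assumption based on proceed() passing callback=self.onparsed.

def on_parsed(data):
    # receives whatever the provider JS script produced for the page
    print(data)

Spider.setupCookies()                      # optional: enable cookie support
spider = Spider(provider='providers/article.js', charset='utf-8')
spider.setUserAgent()                      # send the default Webkit User-Agent
spider.onparsed = on_parsed
spider.append('https://example.com/page/1')
spider.proceed(proceed_num=5, proceed_wait=2)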