def find_signature(self, html_content):
    # Return True if the signature is found in the script content, else False.
    # The HTML has to be parsed first, since scripts only run when the HTML syntax is correct.
    parser = HtmlParser(html_content)
    return any(self.SIGNATURE in s for s in parser.script_text)
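# Hedged usage sketch for find_signature: 'detector' stands for the hypothetical host
# object of the method above; the sample HTML and signature string are illustrative.
html = '<html><body><script>var tracker = "ACME_SIGNATURE";</script></body></html>'
detector.SIGNATURE = 'ACME_SIGNATURE'       # marker searched for in <script> text
print(detector.find_signature(html))        # True when the marker appears in any script block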
def parseHtml(self, base_url, html):
    try:
        parser = HtmlParser()
        parsing_result = parser.parse(html, False)
        [title_text, full_text, core_text, rest_text, all_links, etc_datas] = parsing_result
        tree = parser.tree
        title = ""
        body = ""
        write_time = 0
        imgs = None
        res = dict()
        s_id = tree.root.id
        view_node = tree.root
        e_id = view_node.findENode()
        (
            extractedContent,
            rest_text,
            links_in_summary,
            imgs,
            core_len,
            text_list,
        ) = tree.root.getTextImageWithPosition(s_id, e_id, "IN")
        # Normalise whitespace and restore '&', which the parser emits as "|11818|".
        body_pieces = extractedContent.split()
        body_text = " ".join(body_pieces)
        body = body_text.replace("|11818|", "&")
        res["all_links"] = all_links
        res["links"] = links_in_summary
        res["image_count"] = len(imgs)
        res["images"] = list()
        for i_link in imgs:
            r_image = parser.makePerfectURL(i_link)
            res["images"].append(r_image)
        res["embed_links"] = parser.embed_links
        res["meta_data"] = parser.meta_dict
        res["body_extension"] = rest_text.strip()
        return body, res
    except Exception as msg:
        pass
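# Hedged usage sketch for parseHtml: 'extractor' stands for the hypothetical host object,
# and the URL and HTML string are illustrative; only the (body, res) return shape comes
# from the method above.
html_text = "<html><head><title>t</title></head><body><p>hello &amp; welcome</p></body></html>"
body, res = extractor.parseHtml("http://example.com/article", html_text)
print(body)                                    # whitespace-normalised body text with '&' restored
print(res["image_count"], res["meta_data"])    # keys filled in by parseHtml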
class SpiderWorker(object):

    def __init__(self, url, size=20):
        self.url = url
        self.pool = ProxiesPool()
        self.parser = HtmlParser(url)
        self.url_manager = URLSManager(url_pattern=url, size=size)
        self.writer = FileWriter()

    @Decorator.time
    def start(self):
        self.url_manager.add_url(self.url)
        while self.url_manager.has_next():
            # Download each pending URL through a fresh proxy from the pool.
            hd = HtmlDownloader(proxies=self.pool.get_proxy_ip())
            url = self.url_manager.get_url()
            data = hd.download(url)
            # Collect outgoing links and queue them for later crawling.
            urls = self.parser.simple_tags(data, 'a', attributes=['href'])
            self.url_manager.add_urls([url_.get('href') for url_ in urls])
            # Record the page title (or a placeholder) together with its URL.
            title = self.parser.element(data, 'title')
            title = title.getText() if title else 'unknown'
            self.writer.load_data('[%s] %s' % (title, url))
        self.writer.writer()
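# Hedged usage sketch for SpiderWorker: the seed URL and size are illustrative;
# ProxiesPool, URLSManager, HtmlDownloader and FileWriter come from the surrounding project.
worker = SpiderWorker('http://example.com/news', size=50)   # follow at most 50 URLs matching the seed pattern
worker.start()                                              # crawl, queue <a href> links, write "[title] url" lines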
def __update_proxy_pool(self):
    downloader = HtmlDownloader()
    proxy_pool = ProxiesPool()
    parser = HtmlParser()
    data = downloader.download(self.proxy_site)
    # The proxy listing interleaves a speed bar and a connect-time bar inside each <tr>.
    speed_times = parser.multilevel_tags(data, [{'tr': None}, {'div': {'class': 'bar'}}])
    ip_data = parser.elements(data, 'tr')[1:]
    speed = speed_times[::2]
    times = speed_times[1::2]
    for i, ip in enumerate(ip_data):
        d = {}
        # The first two non-empty text lines of a row are the IP and the port.
        for j, value in enumerate(filter(lambda x: x, ip_data[i].text.split('\n'))):
            if j == 0:
                d['ip'] = value
            elif j == 1:
                d['port'] = value
                continue
        if len(d.keys()) != 2:
            continue
        # Skip rows whose speed or connect-time value, parsed from the bar's title attribute, exceeds 1.
        if self.__re_number(speed[i].get('title')) > 1 \
                or self.__re_number(times[i].get('title')) > 1:
            continue
        proxy_pool.add({'http': '%s:%s' % (d.get('ip'), d.get('port'))})
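# Hedged sketch of what __update_proxy_pool feeds into ProxiesPool: each row that passes
# the speed/connect-time filter becomes one mapping (the address below is illustrative).
proxy_pool = ProxiesPool()
proxy_pool.add({'http': '203.0.113.10:8080'})   # same '%s:%s' % (ip, port) shape as above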
def get_links(self, html):
    """Parse the HTML content and return the links found in the href
    attributes of its <a> tags, restricted to the seed URL's domain.
    """
    hrefs = set()
    parser = HtmlParser(html)
    # Walk the href values collected by the parser.
    for href in parser.hrefs:
        u_parse = urlparse(href)
        # Keep relative links and links that stay on the seed URL's domain.
        if u_parse.netloc == '' or u_parse.netloc == self.domain:
            hrefs.add(href)
    return hrefs
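# Hedged usage sketch for get_links: 'crawler' stands for the hypothetical host object,
# and the HTML snippet is illustrative; only the filtering behaviour comes from the method.
page = '<a href="/about">About</a> <a href="http://other.example.org/x">External</a>'
crawler.domain = 'www.example.com'     # links outside this domain are dropped
print(crawler.get_links(page))         # {'/about'}: relative link kept, external link filtered out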
import cookielib
import time
import traceback
import urllib2

# HtmlParser, Webkit and logger are project-local helpers imported elsewhere
# in the original module.


class Spider():

    _proceed_num = 10    # maximum number of pages to fetch
    _proceed_wait = 2    # pause between two page fetches (seconds)

    '''
    A (parse) provider is a piece of JavaScript that parses the page DOM and
    produces data; the data is then passed via __callback__ to the Spider's
    onparsed event.
    '''
    _default_provider = None   # default provider script
    _default_charset = None

    logger = logger().instance()

    def __init__(self, provider=None, charset=None):
        '''
        target: (url, charset, provider)
            url      - page URL
            provider - the parse provider (a JS file) used to parse the page
            charset  - page encoding
        '''
        self._targets = []
        self._proceeding_cursor = 0
        self.parser = HtmlParser()
        self._default_request_headers = {}   # default HTTP request headers
        self._default_provider = provider
        self._default_charset = charset

    @staticmethod
    def setupCookies():
        ''' Enable cookie support, e.g. for pages that require a login session. '''
        cookie_jar = cookielib.CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))
        urllib2.install_opener(opener)

    def setRequestHeader(self, key, value):
        self._default_request_headers[key] = value

    def setUserAgent(self, ua=Webkit().userAgent):
        ''' Send a User-Agent header with every request. '''
        self.setRequestHeader('User-Agent', ua)

    def append(self, url, charset=None, provider=None, headers={}, body={}):
        ''' Add a new crawl target. '''
        request_headers = {}
        request_headers.update(self._default_request_headers)
        request_headers.update(headers)
        #self.logger.info(request_headers)
        self._targets.append(
            (url,
             charset or self._default_charset,
             provider or self._default_provider,
             request_headers,
             body))

    def proceed(self, proceed_num=None, proceed_wait=None):
        proceed_num = proceed_num or self._proceed_num
        proceed_wait = proceed_wait or self._proceed_wait
        while (self._proceeding_cursor < len(self._targets)
               and self._proceeding_cursor < proceed_num):
            try:
                target = self._targets[self._proceeding_cursor]
                self.logger.info('new proceeding... target: ' + target[0])
                (url, charset, provider, headers, body) = target
                script = "require('" + provider + "');"
                self.parser.parse(url, charset, headers, body, script, callback=self.onparsed)
            except Exception as ex:
                self.logger.error(ex)
                self.logger.debug(traceback.format_exc())
            finally:
                # The original snippet is truncated here; the loop presumably
                # advances the cursor and waits proceed_wait seconds before the
                # next target.
                self._proceeding_cursor += 1
                time.sleep(proceed_wait)
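# Hedged usage sketch for the provider-based Spider: the URL, charset, provider
# script path and onparsed handler below are illustrative, not part of the class.
spider = Spider(provider='providers/article.js', charset='utf-8')
spider.setUserAgent()                        # default Webkit User-Agent on every request

def on_parsed(data):                         # hypothetical handler; receives the JS provider's output
    Spider.logger.info(data)

spider.onparsed = on_parsed
spider.append('http://example.com/list')     # queue one target using the default charset and provider
spider.proceed(proceed_num=1)                # fetch and parse the queued target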
def _read_in_format(self, file):
    return HtmlParser(file)
import json
import os
from queue import Queue
from threading import Thread, Timer
from time import sleep

# HtmlParser is provided by the surrounding project.


class Spider(object):

    def __init__(self, worker_num=10, chunk_size=10000, log_interval=600,
                 data_dir='data', log_dir='log'):
        self.chunk_size = chunk_size
        self.log_interval = log_interval
        self.urls = Queue()
        self.results = Queue()
        self.url_cache = set()
        self.name_cache = set()
        self.black_urls = set()
        self.black_cache = dict()
        self.chunk_num = 0
        self.parser = HtmlParser(home='https://baike.baidu.com')
        self.last = 0
        self.state = 1
        if not os.path.exists(data_dir):
            os.mkdir(data_dir)
        if not os.path.exists(log_dir):
            os.mkdir(log_dir)
        self.data_dir = data_dir
        self.log_dir = log_dir
        self.writer = Thread(target=self._write)
        self.logger = Timer(log_interval, self._log)
        self.spiders = [Thread(target=self._scrap) for _ in range(worker_num)]

    def start(self, url):
        new_urls, new_data = self.parser.parse(url)
        self.results.put(new_data)
        self.url_cache.add(url)
        self.name_cache.add(new_data['name'])
        for url in new_urls:
            self.urls.put(url)
        self.logger.start()
        self.writer.start()
        for spider in self.spiders:
            spider.start()

    def _write(self):
        """Consumes self.results only."""
        while self.state:
            self.chunk_num += 1
            n = 0
            with open(
                    os.path.join(self.data_dir, '{}.json'.format(self.chunk_num)),
                    'wb') as fp:
                while n < self.chunk_size:
                    if not self.results.empty():
                        result = self.results.get()
                        line = json.dumps(result, ensure_ascii=False) + '\n'
                        fp.write(line.encode('utf8'))
                        n += 1
                    else:
                        sleep(10)

    def _log(self):
        now = len(self.name_cache)
        increase = now - self.last
        self.last = now
        if increase == 0:
            self.state = 0
            print('Exit: no entities scraped in this round.')
            exit()
        else:
            with open(os.path.join(self.log_dir, 'log'), 'ab+') as fp:
                message = 'New entries: {}, entries scraped: {}; URLs collected: {}, queued tasks: {}, buffered results: {}.'.format(
                    increase,
                    now,
                    len(self.url_cache),
                    self.urls.qsize(),
                    self.results.qsize(),
                ) + '\n'
                fp.write(message.encode('utf8'))
            # Re-arm the timer so statistics are logged every log_interval seconds.
            timer = Timer(self.log_interval, self._log)
            timer.start()

    def _scrap(self):
        while self.state:
            if not self.urls.empty():
                url = self.urls.get()
                try:
                    new_urls, new_data = self.parser.parse(url)
                except Exception:
                    self.url_cache.remove(url)
                    # Blacklist URLs that fail repeatedly.
                    if url not in self.black_cache:
                        self.black_cache[url] = 1
                    self.black_cache[url] += 1
                    if self.black_cache[url] >= 3:
                        self.black_urls.add(url)
                    continue
                name = new_data['name']
                if name not in self.name_cache:
                    self.name_cache.add(name)
                    if new_data['infomation']:  # drop entries that carry no attribute information
                        self.results.put(new_data)
                for url in new_urls:
                    if url not in self.url_cache and url not in self.black_urls:
                        self.url_cache.add(url)
                        self.urls.put(url)
            else:
                sleep(10)
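# Hedged usage sketch for the baike Spider: the seed item URL is illustrative; only the
# constructor arguments and start() call come from the class above.
spider = Spider(worker_num=4, chunk_size=1000, log_interval=300)
spider.start('https://baike.baidu.com/item/Python')   # seed entry; worker, writer and logger threads start here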
# -*- coding: utf-8 -*-
import sys, requests, json, os, io

from parser import HtmlParser
from datahandler import DataHandler


if __name__ == '__main__':
    parse = HtmlParser()
    datahandler = DataHandler()
    datahandler.SaveFiles()

    # Work out the range of pages that still need to be fetched.
    downList = []
    with open('./data/error_page.txt') as data_file:
        downList = json.load(data_file)

    parser = HtmlParser()
    parser.pages = downList
    parser.bar = ProgressBar(total=len(downList))  # ProgressBar is imported elsewhere in the original script
    parser.parse()