def find_webpage(self, link_url="http://www.baidu.com/"):
    url_struct = get_url_info(link_url)
    domain = url_struct.get("domain", "baidu.com")
    urls_info = self.webpage_db[domain].find({'url': link_url})
    # Return the first matching record with its ObjectId made serialisable
    for url_info in urls_info:
        url_info['_id'] = str(url_info['_id'])
        return url_info
def find_linkbase(self, link_url="http://www.baidu.com/"):
    url_struct = get_url_info(link_url)
    domain = url_struct.get("domain", "baidu.com")
    urls_info = self.linkbase_db[domain].find({'url': link_url})
    for url_info in urls_info:
        # Unpickle the stored link attributes and convert them into a JSON-safe dict
        json_dict = {}
        link_str = url_info['link_attr']
        link_attr = pickle.loads(link_str)
        if not link_attr:
            return None
        if not link_attr.url:
            link_attr.url = link_url
        link_attr = vars(link_attr)
        for key, val in link_attr.items():
            if not val:
                json_dict[key] = None
            elif key in ("crawl_info", "parent_info", "page_info", "extract_message"):
                json_dict[key] = vars(val)
            elif key == "normal_crawl_his":
                json_dict[key] = [vars(his) for his in val]
            else:
                json_dict[key] = str(val)
        return json_dict
def nor_to_hz(self, request):
    # Fold the POST body into the URL as a base64-encoded 'HZPOST' query parameter,
    # so the request can be carried around as a plain URL.
    if request.post_data and request.method == "post":
        url_info = get_url_info(request.url)
        query_info = url_query_decode(url_info.get('query'))
        query_info['HZPOST'] = base64_encode_json(request.post_data)
        hz_url = request.url.split("?")[0] + "?" + urllib.urlencode(query_info)
        request.url = hz_url
def hz_to_nor(self, request):
    # Reverse of nor_to_hz: restore the POST body from the 'HZPOST' query parameter
    # and rebuild the original URL.
    url_info = get_url_info(request.url)
    query_info = url_query_decode(url_info.get('query'))
    request.post_data = base64_decode_json(query_info.get('HZPOST'))
    query_info.pop('HZPOST')
    nor_url = request.url.split("?")[0] + "?" + urllib.urlencode(query_info)
    request.url = nor_url
    request.method = 'post'
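# A minimal, self-contained sketch of the HZPOST round trip implemented by
# nor_to_hz/hz_to_nor above. The underscore-prefixed helpers are hypothetical
# stand-ins for the project's url_query_decode / base64_encode_json /
# base64_decode_json, assumed to be a query-string parser and base64-wrapped
# JSON codecs; the real signatures may differ.
import base64
import json
import urllib
import urlparse


def _url_query_decode(query):
    # Assumed behaviour: "a=1&b=2" -> {'a': '1', 'b': '2'}
    return dict(urlparse.parse_qsl(query or ''))


def _base64_encode_json(obj):
    return base64.b64encode(json.dumps(obj))


def _base64_decode_json(data):
    return json.loads(base64.b64decode(data))


post_data = {'page': 2, 'keyword': 'news'}
url = 'http://example.com/search?lang=zh'

# nor_to_hz direction: fold the POST body into the query string
query = _url_query_decode(urlparse.urlparse(url).query)
query['HZPOST'] = _base64_encode_json(post_data)
hz_url = url.split('?')[0] + '?' + urllib.urlencode(query)

# hz_to_nor direction: recover the POST body and the plain URL
query = _url_query_decode(urlparse.urlparse(hz_url).query)
restored = _base64_decode_json(query.pop('HZPOST'))
nor_url = hz_url.split('?')[0] + '?' + urllib.urlencode(query)
assert restored == post_data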
def _get_next_page_by_get_count(self):
    next_page = self.base_url
    url_info = get_url_info(self.base_url)
    query_info = url_query_decode(url_info.get('query'))
    new_query = self._build_count_query(query_info)
    if not new_query:
        return None
    next_page = next_page.split('?')[0] + "?" + urllib.urlencode(new_query)
    return next_page
def is_matched(url, rule):
    site_prefix = rule.get('site_prefix')
    url_format = rule.get('url_format')
    url_type = rule.get('url_type')
    site = get_url_info(url).get('site')
    if isinstance(site_prefix, basestring):
        m_site = re.match(site_prefix, site)
        if m_site and isinstance(url_format, basestring):
            m_url = re.match(url_format, url)
            if m_url and isinstance(url_type, int):
                return True
    return False
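# A hedged usage sketch for is_matched. The rule keys below mirror the ones the
# function reads; the regexes and urls are hypothetical, and the expected results
# assume get_url_info returns {'site': 'news.example.com', ...} for the first url.
example_rule = {
    'site_prefix': r'news\.example\.com',    # regex tested against the url's site
    'url_format': r'.*/article/\d+\.html',   # regex tested against the full url
    'url_type': 1,                           # must be an int for the rule to match
}
# is_matched('http://news.example.com/article/123.html', example_rule)  -> True
# is_matched('http://blog.example.com/article/123.html', example_rule)  -> False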
def pack_base_info(self, download_rsp):
    """
    :param DownloadRsp:
    :return BaseInfo:
    """
    base_info = BaseInfo()
    base_info.url = download_rsp.url
    url_info = get_url_info(base_info.url)
    base_info.domain_id = url_info.get('domain_id')
    base_info.domain = url_info.get('domain')
    base_info.site = url_info.get('site')
    base_info.site_id = url_info.get('site_id')
    base_info.url_id = url_info.get('url_id')
    base_info.src_type = download_rsp.src_type
    return base_info
def _get_next_page_by_post_count(self):
    next_page = self.base_url
    url_info = get_url_info(self.base_url)
    query_info = url_query_decode(url_info.get('query'))
    post_param_str = query_info.get('HZPOST', None)
    if not post_param_str:
        return None
    post_param = base64_decode_json(post_param_str)
    next_post_param = self._build_count_query(post_param)
    if not next_post_param:
        return None
    query_info['HZPOST'] = base64_encode_json(next_post_param)
    next_page = next_page.split("?")[0] + "?" + urllib.urlencode(query_info)
    return next_page
def start_convert(self, page_parseinfo):
    link_attr = None
    try:
        if not page_parseinfo:
            return None
        extractor_crawl_info = page_parseinfo.crawl_info
        base_info = page_parseinfo.base_info
        extract_info = page_parseinfo.extract_info
        url = base_info.url
        url_info = get_url_info(url)
        data_extends = page_parseinfo.data_extends
        self.log.info('merge_start\turl:{}'.format(url))
        # Process the webpage info
        webpage_merge = WebpageMerge(self.webpage_connection)
        webpage_obj = webpage_merge.merge_webpage(base_info, extractor_crawl_info,
                                                  page_parseinfo.data_extends,
                                                  page_parseinfo.parse_extends)
        # Save the processed webpage back to the database
        webpage_merge.save_webepage(base_info.domain, webpage_obj)
        # scheduler is expected to be a JSON-format string
        scheduler_obj = {}
        try:
            scheduler_obj = json.loads(page_parseinfo.scheduler)
        except Exception as e:
            pass
        base_crawl_info = dict(extractor_crawl_info.__dict__)
        base_crawl_info.update(scheduler_obj)
        if base_crawl_info.get("status_code") != 0:
            return None
        link_merge = LinkMerge(self.link_connection, url_info, extract_info.links, self.log)
        link_attr = link_merge.merge_link_attr(base_info, extract_info, scheduler_obj,
                                               base_crawl_info, data_extends)
        # Save the processed link info back to the database
        link_merge.save_link_attrs()
        self.log.info('merge url:{}'.format(base_info.url))
        if base_info.src_type == "webpage":
            return None
    except Exception as e:
        self.log.error("url:{}".format(traceback.format_exc()))
    return link_attr
def select_webpage_by_url(self, url):
    self.log.info("select_webpage_by_url start\turl:{}".format(url))
    url = url_encode(url)
    # Default empty response, returned when the page is not found in MongoDB
    download_result = DownLoadRsp(url=url,
                                  download_time=int(time.time()),
                                  status=1,
                                  content_type='text/html',
                                  page_size=0,
                                  elapsed=100,
                                  content=None,
                                  redirect_url=url,
                                  src_type='webpage',
                                  http_code=0)
    try:
        query_item = {'url': url}
        domain = get_url_info(url).get('domain')
        result = self.mongo_client_web.find_first(domain, query_item)
        if result and result.get('content'):
            download_result = self.get_download_rsp(result)
    except:
        self.log.error("select_webpage_by_url\turl\t{0}\terror:{1}".format(
            url, traceback.format_exc()))
    self.log.info("select_webpage_by_url finish\turl:{}".format(url))
    return download_result
def schedule_task(self, task):
    # dispatched_task_queue is the special queue with the highest priority
    if not task:
        return None
    site_info = get_url_info(task.url)
    site_scheduler = self.get_site_scheduler(site_info)
    if site_scheduler:
        self.site_empty[site_scheduler.site_id] = False
    else:
        # No scheduler for this site yet: build one from the site config
        site_info = self.sites.get(site_info['site_id'], {})
        if site_info == {}:
            return None
        site_info['avg_interval'] = 10
        site_scheduler = SiteScheduler(site_info, self.conf['redis_tasks'],
                                       self.conf['log'], self.site_statistic,
                                       self.seed_statistic)
        self.site_schedulers[site_info['site_id']] = site_scheduler
        self.start_one_site_tasks(site_info['site'])
        self.log.info('schedule_task\turl:%s\tsite:%s\tnot_exit' % (task.url, site_info['site']))
    return site_scheduler.schedule(task)
class Extractor(object):
    CHARSET_PATTERN = re.compile(
        '<meta[^>]*?(?:charset|CHARSET)=["\']?([a-zA-Z0-9\\-]+)["\']?[^>]*?>',
        re.I | re.S)
    ENCODING_PATTERN = re.compile(
        '<\?xml[^>]*? (?:encoding="[a-zA-Z0-9\\-]+")[^>]*?\?>', re.I | re.S)
    BR_PATTERN = re.compile("</\s*br\s*>", re.I | re.S)
    CONTENT_THRESHOLD = 200

    def __init__(self, conf):
        self.log = conf['log']
        self.log.info('Extractor load start')
        self.conf = conf
        self.config_handler = ConfigHandler(conf, self.log)
        self.plugin_handler = PluginHandler()
        self.log.info('Extractor load finish')

    def _get_charset(self, content):
        nodes = re.findall(self.CHARSET_PATTERN, content)
        if nodes:
            return nodes[0]
        return None

    # Decode the page body into unicode
    def decode_body(self, body, link, download_type='simple'):
        if not body or isinstance(body, unicode):
            return body, None
        if download_type == 'phantom':
            # Pages fetched by phantomjs are always utf-8 encoded
            try:
                return body.decode('utf-8'), 'utf-8'
            except:
                pass
        charset = self._get_charset(body)
        for try_charset in ['utf-8', 'gb18030', 'gbk', 'utf-16']:
            try:
                return body.decode(try_charset), try_charset
            except Exception as e:
                pass
        try:
            return body.decode(charset, errors='ignore'), charset
        except Exception as e:
            pass
        self.log.warning("the page from {} could not be decoded correctly".format(link))
        return None, None

    def pack_crawl_info(self, download_rsp):
        """
        :param DownloadRsp:
        :return CrawlInfo:
        """
        crawl_info = CrawlInfo()
        crawl_info.content = download_rsp.content
        crawl_info.status_code = download_rsp.status
        crawl_info.http_code = download_rsp.http_code
        crawl_info.download_time = download_rsp.download_time
        crawl_info.redirect_url = download_rsp.redirect_url
        crawl_info.elapsed = download_rsp.elapsed
        crawl_info.content_type = download_rsp.content_type
        crawl_info.page_size = download_rsp.page_size
        return crawl_info

    def pack_base_info(self, download_rsp):
        """
        :param DownloadRsp:
        :return BaseInfo:
        """
        base_info = BaseInfo()
        base_info.url = download_rsp.url
        url_info = get_url_info(base_info.url)
        base_info.domain_id = url_info.get('domain_id')
        base_info.domain = url_info.get('domain')
        base_info.site = url_info.get('site')
        base_info.site_id = url_info.get('site_id')
        base_info.url_id = url_info.get('url_id')
        base_info.src_type = download_rsp.src_type
        return base_info

    def fix_links_info(self, links, custom_links, parser_config):
        """
        :param Links:
        :param link_extend_rule: [{'rule': '', 'parser_id': 1}]
        :return:
        """
        rets = []
        # Merge custom links into the extracted links, keyed by url
        links_set = {}
        for link in links:
            links_set[link.url] = link
        for cl in custom_links:
            if cl.url in links_set:
                links_set[cl.url].parse_extends = cl.parse_extends
                links_set[cl.url].type = cl.type
            else:
                links_set[cl.url] = cl
        links = links_set.values()
        # Collect the "filter" rules from the parser config
        filter_rule = []
        try:
            for ll in parser_config.urls_rule:
                if ll['$parse_method'] == u"filter":
                    filter_rule.append(ll)
        except Exception as e:
            self.log.warning("filter_links failed, because {}".format(str(e)))
        for link in links:
            try:
                link.url = tools.url_encode(link.url)
                url_info = tools.get_url_info(link.url)
                link.domain = url_info.get('domain')
                link.site = url_info.get('site')
                link.site_id = url_info.get('site_id')
                link.domain_id = url_info.get('domain_id')
                link.url_id = url_info.get('url_id')
                if not link.type:
                    # Assign link type and parser_id from the first matching filter rule
                    for r in filter_rule:
                        try:
                            if r.get('$parse_rule') and re.findall(r['$parse_rule'], link.url):
                                link_type = str(r['$link_type'])
                                link_type = int(link_type) if link_type.isdigit() else LinkType.kUnknownLink
                                parser_id = str(r['$parser_id'])
                                parser_id = int(parser_id) if parser_id.isdigit() else -1
                                link.type = link_type
                                link.parse_extends = json.dumps({'parser_id': parser_id})
                                break
                        except Exception as e:
                            pass
                if not link.type:
                    link.type = LinkType.kUnknownLink
                rets.append(link)
            except Exception as e:
                self.log.warning("fix_links_info of {} error, because of {}".format(
                    link.url, e.message))
        return rets
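# Hedged illustration of the "filter" rule shape fix_links_info expects to find in
# parser_config.urls_rule: a regex applied to link.url plus a link type and parser id
# given as digit strings. The concrete values here are hypothetical.
example_filter_rule = {
    '$parse_method': u'filter',
    '$parse_rule': r'.*/article/\d+\.html',  # matched against link.url via re.findall
    '$link_type': '2',                       # digit string -> int, otherwise kUnknownLink
    '$parser_id': '17',                      # digit string -> int, otherwise -1
}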