class Manager:
    """Core manager: hands out links to crawlers and collects their results.

    The manager listens on a TCP socket. Each accepted connection is served
    by its own thread (see ``run`` / ``handle_connection``). A crawler first
    announces what it wants with a short byte string:

    * ``b'SEND'``      - it is about to upload a pickled result dict;
    * ``b'REQUEST'``   - it wants a batch of links to crawl;
    * ``b'FOCUSING?'`` - (inside a REQUEST) it asks whether it should do
      focused crawling.
    """

    # Connection types announced by a crawler. Kept as class-level constants
    # so they are available without running ``__init__``; instance access
    # (``self.SEND``) works exactly as before.
    SEND = 0
    REQUEST = 1
    ASKFOCUSING = 2

    def __init__(self, ip, port, buff_size=1024, listen_num=5, thread_num=10):
        """Read the configuration, open the listening socket and load state.

        Args:
            ip: address the listening socket is bound to.
            port: port the listening socket is bound to.
            buff_size: size passed to every ``recv`` call.
            listen_num: backlog passed to ``socket.listen``.
            thread_num: soft cap on concurrently tracked handler threads.
        """
        self.conf = ConfReader("manager.conf", default_conf)
        self.my_open = uopen if self.conf.get("buffer_output") == "no" else open
        self.links_file = self.conf.get("links_file")
        self.how_many_links_file = self.conf.get("how_many_links_file")

        # Record of all the links that have been crawled.
        self._links_track = self.my_open(self.how_many_links_file, "w+")
        self._links_track_lock = threading.Lock()

        # How many links are sent to a crawler per request.
        self._nsent = self.conf.get("links_to_crawler_NR")

        self.ip = ip
        self.port = port
        self.buff_size = buff_size
        self.listen_num = listen_num

        # Socket initialization. SO_REUSEADDR has to be set *before* bind();
        # setting it afterwards does not affect the bind that already
        # happened.
        self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        self.sock.bind((self.ip, self.port))
        self.sock.listen(self.listen_num)

        _blmftr_record = self.conf.get("blmftr_record")
        _start_fresh = self.conf.get("continuous_crawl") != "yes"

        # The bloom filter holds every link that was already handed out for
        # crawling (whether or not the crawl succeeded).
        self.bloom_filter = Bloom_filter.Bloom_filter(
            10000,
            0.001,  # error rate
            filename=(_blmftr_record, -1),
            start_fresh=_start_fresh)
        self.bf_lock = threading.Lock()  # guards bloom_filter

        self.prio_que = PriQueue(self.links_file)  # manager's priority queue
        self.prio_que.get_links_from_disk()  # initially get links from disk
        for link in self.prio_que.links:
            self.bloom_filter.add(link)

        self.prio_ful_threshold = self.conf.get("prio_ful_threshold")

        self.thread_list = []  # handler threads started by run()
        self.thread_num = thread_num  # how many threads we tolerate

        # How many links to crawl from a website. Not used at the moment.
        self.auto_speed = True
        self.speed_count = 0
        speed = self.conf.get("speed")
        if speed != "auto":
            self.speed_count = speed
            self.auto_speed = False

        self.focusing = self.conf.get("focus") != "no"
        # crawling_width takes effect while self.focusing is False.
        self.crawling_width = self.conf.get("crawling_width")
        self.search_engine_weed = self.conf.get("search_engine_weed")

    def bf_acquire(self):
        """Acquire the lock that guards the bloom filter."""
        self.bf_lock.acquire()

    def bf_release(self):
        """Release the lock that guards the bloom filter."""
        self.bf_lock.release()

    def generate_search_engine_random_links(self):
        """Queue 1000 random search-engine links.

        Each link is ``self.search_engine_weed`` followed by the URL-quoted
        combination of one random line from ``conf/american-english`` and one
        from ``conf/idf-chinese.txt`` (e.g. https://www.baidu.com/s?wd=xxxx).
        """
        with open("conf/american-english", "r") as english_file:
            english_lines = english_file.readlines()
        with open("conf/idf-chinese.txt", "r") as chinese_file:
            chinese_lines = chinese_file.readlines()

        if not english_lines or not chinese_lines:
            logger.info("Word lists are empty, no random links generated")
            return

        for _ in range(1000):
            # random.choice never indexes past the end, unlike
            # randint(0, len(lines)) whose upper bound is inclusive.
            line1 = random.choice(english_lines).strip()
            line2 = random.choice(chinese_lines).strip()
            link = self.search_engine_weed + urllib.parse.quote_plus(
                line1 + "+" + line2)
            logger.info("Link generated: " + str(link))
            self.prio_que.append_random(link)

    def handle_connection(self, conn, addr):
        """Serve one crawler connection, then close it.

        Runs in its own thread, so no exception is allowed to escape: every
        failure is logged and the connection is closed.

        Args:
            conn: the accepted socket.
            addr: the peer address as returned by ``accept``.
        """
        # A timeout keeps one broken crawler from tying up a manager thread
        # forever.
        conn.settimeout(60)
        try:
            method = self.get_conn_type(conn)
            if method == self.SEND:
                self._handle_send(conn)
            elif method == self.REQUEST:
                self._handle_request(conn, addr)
            else:
                raise Exception("UNKNOWN CONNECTION TYPE")
        except Exception as e:
            # Must not propagate: this is the top of a worker thread.
            logger.info("Exception:[%s]" % str(e), Logger.STDERR)
        finally:
            conn.close()

    def _handle_send(self, conn):
        """Receive a crawler's pickled result dict and merge it into state."""
        conn.sendall(b'OK')
        chunks = []
        while True:
            data = conn.recv(self.buff_size)
            if not data:
                break
            chunks.append(data)

        # SECURITY NOTE(review): pickle.loads on bytes read from the network
        # can execute arbitrary code. Only acceptable if every peer that can
        # reach this port is trusted - confirm, or switch the wire format.
        # The result dict maps link -> collection of sub links, or 'FAIL'.
        result_dict = pickle.loads(b''.join(chunks))

        crawled_links = []
        for link, sub_links in result_dict.items():
            # A failed link is simply dropped: every link is crawled at most
            # once, successful or not.
            if sub_links == 'FAIL':
                continue
            crawled_links.append(link)
            # ``with`` guarantees the lock is released even if merging fails;
            # a lock left held would block every other handler thread.
            with self.bf_lock:
                # Limit the queue size (exceeding it a little is fine).
                if len(self.prio_que) < self.prio_ful_threshold:
                    for sub_link in sub_links:
                        if sub_link not in self.bloom_filter:
                            self.bloom_filter.add(sub_link)
                            self.prio_que.append(sub_link)

        # Remove links of the dominant domain so that links from other
        # domains get an opportunity to be crawled.
        if self.focusing:
            self.prio_que.remove_dominant()

        # Write all crawled links to the tracking file.
        with self._links_track_lock:
            for link in crawled_links:
                self._links_track.write(str(link) + "\n")

    def _handle_request(self, conn, addr):
        """Send a crawler up to ``self._nsent`` links as a pickled list."""
        conn.sendall(b'OK')
        if self.prio_que.domains_nr() > self.crawling_width:
            # Enough domains are known now, switch to focused crawling.
            self.focusing = True
            logger.info(
                "[[Focused-crawling]] crawling_width[%d], domains_nr[%d]\n" % (
                    self.crawling_width, self.prio_que.domains_nr()),
                Logger.STDOUT)

        # The crawler may ask whether or not it should be focusing.
        if self.get_conn_type(conn) == self.ASKFOCUSING:
            # The answer is always exactly two bytes.
            conn.sendall(b'OK' if self.focusing else b'NO')

        links_buffer = []
        try:
            for _ in range(self._nsent):
                links_buffer.append(self.prio_que.get_by_addr(addr[0]))
        except EmptyPriQueue:
            # Queue ran dry: send whatever was collected (maybe nothing).
            pass
        except Exception as e:
            raise Exception(
                "Exception:[%s] when getting links from PriQueue" % str(e)
            ) from e

        # With nothing to hand out, refill the queue from a search engine;
        # this reply is still the empty list.
        if not links_buffer:
            logger.info("Empty priority queue now. Trying to generate random links from search engine...")
            self.generate_search_engine_random_links()

        conn.sendall(pickle.dumps(links_buffer))

    def get_conn_type(self, conn):
        """Read one message from ``conn`` and map it to a connection type.

        Returns:
            ``SEND``, ``REQUEST`` or ``ASKFOCUSING``, or ``None`` when the
            received bytes match none of the known messages.
        """
        data = conn.recv(self.buff_size)
        known_types = {
            b'SEND': self.SEND,
            b'REQUEST': self.REQUEST,
            b'FOCUSING?': self.ASKFOCUSING,
        }
        return known_types.get(data)

    def run(self):
        """Accept connections forever, serving each one in a new thread."""
        logger.info("manager start running at: [%s]\n"
                    % str(datetime.datetime.now()), Logger.STDOUT)
        while True:
            # Forget handlers that already finished so the list stays small
            # and accept() is not delayed for threads that are long done.
            self.thread_list = [t for t in self.thread_list if t.is_alive()]
            # Only a fixed number of threads is wanted: when still over the
            # limit, wait for the running handlers before accepting more.
            if len(self.thread_list) > self.thread_num:
                for thread in self.thread_list:
                    thread.join()
                self.thread_list = []

            conn, addr = self.sock.accept()
            logger.info("Connection established: %s\n" % str(addr),
                        Logger.STDOUT)
            t = threading.Thread(target=self.handle_connection,
                                 args=(conn, addr))
            self.thread_list.append(t)
            t.start()
class Crawler(object):
    """The crawler.

    ``run`` loops forever: it fetches a batch of urls from the manager,
    downloads them with a thread pool, stores each page in the database and
    sends the extracted links back to the manager.
    """

    def __init__(self):
        """Read ``crawler.conf``, connect to the database and reset the table.

        NOTE: the crawler table is created if missing and then *truncated*,
        so every start of the crawler begins with an empty table.
        """
        self.conf = ConfReader("crawler.conf", default_conf)
        self.logger = Logger()
        self.db = None

        self.thread_pool_size = self.conf.get("thread_pool_size")
        self.left_ip = self.conf.get("manager_ip")
        self.left_port = self.conf.get("manager_port")
        self._buffer_size_threshold = self.conf.get("buffer_size_threshold")
        # How many links get_links() returns per call (at most).
        self._crawl_NR = self.conf.get("concurrent_crawl_NR")

        self.my_open = uopen if self.conf.get("buffer_output") == "no" else open
        self.content_path = self.conf.get("content_path")
        self.crawling_timeout = self.conf.get("crawling_timeout")

        self.DB_url = self.conf.get("DB_url")
        self.DB_user = self.conf.get("DB_user")
        self.DB_passwd = self.conf.get("DB_passwd")
        self.crawler_DB = self.conf.get("crawler_DB")
        self.crawler_table = self.conf.get("crawler_table")

        self.db = DBHandler(self.crawler_DB, self.DB_user, self.DB_passwd,
                            self.DB_url)
        self.db.connect()
        # MySQL columns are case insensitive (contrary to Oracle) for search
        # operations; "BINARY" in the column definition changes that default.
        # (Further classify_attribute_* columns were planned but are not
        # part of the table.)
        self.db.update("CREATE TABLE IF NOT EXISTS `" + self.crawler_table + "` ("
                       " `page_id` int(20) NOT NULL AUTO_INCREMENT,"
                       " `page_url` varchar(200) BINARY NOT NULL,"
                       " `domain_name` varchar(100) BINARY NOT NULL,"
                       " `sublinks` text,"
                       " `title` varchar(1024),"
                       " `normal_content` text,"
                       " `emphasized_content` text,"
                       " `keywords` varchar(1024),"
                       " `description` varchar(1024),"
                       " `text` longtext,"
                       " `PR_score` double default 0.0,"
                       " `ad_NR` int default 0,"
                       " `tag` varchar(20) default null,"
                       " PRIMARY KEY (`page_id`),"
                       " INDEX (`page_url`)"
                       ")CHARSET=UTF8, ENGINE=InnoDB")
        self.db.update("truncate table " + self.crawler_table)

        # Holds all the links to be sent back to the manager.
        self._result_dict = {}
        # Holds the links received from the manager and not yet crawled.
        self._buffer = []

        self.result_sender = NetworkHandler(self.left_ip, self.left_port)
        self.links_requester = NetworkHandler(self.left_ip, self.left_port)

        # Whether or not the crawler should do focused crawling.
        self.focusing = True

    def get_links(self):
        """Return the next batch of urls to crawl.

        A local buffer is used so that a big batch can be fetched from the
        manager once (e.g. 'links_to_crawler_NR' = 50) and then handed out in
        smaller portions (e.g. 'concurrent_crawl_NR' = 10), one per call.
        The manager is only contacted when the buffer holds fewer than
        ``buffer_size_threshold`` links; its answer also updates
        ``self.focusing``.

        No timeout is set here on purpose: the crawler cannot continue
        without links from the manager anyway.

        Returns:
            A list with at most ``concurrent_crawl_NR`` urls. When the
            manager has no links, whatever is left in the buffer is returned
            (possibly an empty list).

        Raises:
            Whatever ``links_requester.request()`` raises.
        """
        if len(self._buffer) < self._buffer_size_threshold:
            # The manager returns the links together with a flag telling the
            # crawler whether it should be focused-crawling or not.
            (self.focusing, links) = self.links_requester.request()
            self.logger.info("links_requester succeed request()")
            if not links:
                # Nothing new: return whatever is in the buffer.
                drained = self._buffer
                self._buffer = []
                return drained
            self._buffer.extend(links)
        return self._take_from_buffer()

    def _take_from_buffer(self):
        """Pop up to ``self._crawl_NR`` urls from the end of the buffer."""
        count = min(self._crawl_NR, len(self._buffer))
        batch = []
        for _ in range(count):
            batch.append(self._buffer.pop())
        return batch

    @staticmethod
    def req(url, **kwargs):
        """GET ``url``, retrying while the status code is not 200.

        At most ``_exceeded_try`` attempts are made in total, pausing
        ``_pause_interval`` seconds before every retry (both are module-level
        settings). ``kwargs`` are passed to ``requests.get`` unchanged.

        Returns:
            The response of the last attempt, whatever its status code.
        """
        page = requests.get(url, **kwargs)
        trytime = 1
        while trytime < _exceeded_try and page.status_code != 200:
            # Pause *before* retrying, so no time is wasted after the final
            # attempt and the server is not hit again immediately.
            time.sleep(_pause_interval)
            page = requests.get(url, **kwargs)
            trytime = trytime + 1
        return page

    def get_web(self, resolved_url):
        """Download one page.

        Returns:
            The ``Response`` object when the status is 200 and the content
            type (if announced) is textual, otherwise ``None``. Fetch errors
            are logged and also yield ``None``.
        """
        # Identify as Googlebot. This may be detected due to the ip mismatch.
        # NOTE: according to RFC 7230, HTTP header names are case-insensitive.
        headers = {
            'Accept': 'text/plain, text/html',  # want only text
            "User-agent": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
        }
        try:
            response = Crawler.req(resolved_url, headers=headers,
                                   timeout=self.crawling_timeout)
            self.logger.info("Get response[%d]: [%s]"
                             % (response.status_code, resolved_url))
            # Only textual responses are wanted. Keys of `response.headers`
            # are case insensitive.
            if 'content-type' in response.headers:
                if 'text/' not in response.headers['content-type']:
                    return None
            if response.status_code == requests.codes.ok:  # 200
                return response
            return None
        except Exception as e:
            self.logger.info("Fail to fetch page. Exception: %s, url:[%s]"
                             % (str(e), resolved_url))
            return None

    def run(self):
        """Main routine of the crawler; never returns.

        Each round: get urls from the manager, download them concurrently,
        extract and store the page data, then report the found links (or
        "FAIL") per url back to the manager.
        """
        while True:
            try:
                urls = self.get_links()
            except Exception as e:
                self.logger.info("Cannot get urls. crawler sleep for 10 seconds.\n"
                                 "\tException:[%s]\n" % str(e))
                time.sleep(10)  # wait a bit to see if things get better
                continue
            if not urls:
                self.logger.info("Empty urls from dns_resolver. Crawler will loop")
                time.sleep(10)
                continue

            # Debug output of the received batch.
            self.logger.info("GOT urls from manager: [")
            for u in urls:
                self.logger.info("\t" + u)
            self.logger.info(" ]")

            # Download the pages.
            with ThreadPoolExecutor(self.thread_pool_size) as pool:
                responses = pool.map(self.get_web, urls)

            # Combine every origin url with the links found on its page.
            # pool.map yields results in the order of `urls`.
            for origin, resp in zip(urls, responses):
                if resp is None:
                    self._result_dict[origin] = "FAIL"
                    continue
                try:
                    # resp is known to be textual; resp.text is a str.
                    outer_links, inner_links = self.extract_link(origin,
                                                                 resp.text)
                except Exception as e:
                    self.logger.info(("Exception when extract_links:[%s],"
                                      "url:[%s]\n") % (str(e), origin))
                    continue
                self.logger.info("Finished extract_links()")

                outer_links = set(outer_links)
                inner_links = set(inner_links)
                if self.focusing:
                    self.logger.info("crawler is FOCUSING now.\n")
                    self._result_dict[origin] = self.trim_url_suffix(inner_links)
                else:
                    self._result_dict[origin] = self.trim_url_suffix(outer_links)

                try:
                    self.dump_content(resp, origin)
                except Exception as e:
                    self.logger.info(("Exception when dump_content():[%s],"
                                      "url:[%s]") % (str(e), origin))
                    traceback.print_exc()
                    continue
                self.logger.info("Finished dump_content()")

            data = pickle.dumps(self._result_dict)
            try:
                self.result_sender.send(data)
                self.logger.info("successfully sent back to the left\n")
            except Exception as e:
                self.logger.info(("Fail sending to manager:[%s]\n"
                                  "unsent links:[%s]\n")
                                 % (str(e), str(self._result_dict)))
            finally:
                self._result_dict = {}

    def extract_link(self, origin_url, html):
        """Extract the links of all ``<a href>`` elements of a page.

        Args:
            origin_url: url the page was fetched from.
            html: the page markup as text.

        Returns:
            A tuple ``(outer_links, inner_links)`` of lists with absolute
            http/https urls; see ``_classify_links`` for the rules.
        """
        html_tree = etree.HTML(html)
        # NOTE(review): the parser may give back None for input without any
        # usable markup; treat that as "no links".
        if html_tree is None:
            return ([], [])
        hrefs = html_tree.xpath('//*/a/@href')  # relative or absolute
        return self._classify_links(origin_url, hrefs)

    def _classify_links(self, origin_url, hrefs):
        """Split raw href values into outer and inner absolute links.

        Links to unwanted file types and links with a non-http(s) scheme are
        dropped. Relative links are resolved against ``origin_url``. A link
        is "inner" when its domain equals the domain of ``origin_url``.

        Returns:
            A tuple ``(outer_links, inner_links)`` of lists.
        """
        origin_url = origin_url.strip()
        _, domain = self.get_protocal_domain(origin_url)

        # Useless file pattern (xxx.jpg, xxx.mp4, xxx.css, xxx.pdf, etc).
        uf_pattern = re.compile(r'\.jpg$|\.png|\.xml|\.mp4|\.mp3|\.css|\.pdf|\.svg|\.gz|\.zip|\.rar|\.exe|\.tar')
        # Unsupported protocol pattern (ftp://, sftp://, thunders://, etc).
        up_pattern = re.compile(r'^.{0,10}:')
        # Only the http/https protocols are supported.
        sp_pattern = re.compile(r'http://|https://')

        outer_link_lists = []
        inner_link_lists = []
        for element in hrefs:
            element = element.strip()
            if uf_pattern.search(element):
                continue
            if not sp_pattern.match(element):
                if up_pattern.search(element):
                    continue
                # Resolve relative references properly: this handles
                # "page.html" on a non-root page as well as "//host/path".
                try:
                    element = urllib.parse.urljoin(origin_url, element)
                except ValueError:
                    continue  # malformed href, skip it
                if not sp_pattern.match(element):
                    continue
            _, test_domain = self.get_protocal_domain(element)
            if test_domain != domain:
                outer_link_lists.append(element)
            else:
                inner_link_lists.append(element)
        return (outer_link_lists, inner_link_lists)

    def trim_url_suffix(self, urls):
        """Strip the `#xxxxx' and `?xxxx' suffixes from every url.

        NOTE that ALL URLS PASSED IN MUST BE VALID!

        Returns:
            A list with the trimmed urls, in iteration order of ``urls``.
        """
        return [url.split('#')[0].split('?')[0] for url in urls]

    def get_protocal_domain(self, url):
        """Return ``(protocol, domain)`` of ``url``.

        The domain is the network location, including a port if present.
        A missing part (or an unparsable url) is reported as ``None``.
        """
        try:
            parts = urllib.parse.urlsplit(url)
        except ValueError:
            return (None, None)
        return (parts.scheme or None, parts.netloc or None)

    def dump_content(self, resp, origin_url):
        """Store url, domain, title, text, keywords and description in the DB.

        requests takes ``response.encoding`` from the HTTP response header
        and defaults to 'ISO-8859-1' when the header names no charset, so the
        charset declared inside the page content is consulted to convert the
        page to UTF-8.

        Args:
            resp: the ``Response`` of the page.
            origin_url: the url that was requested. It is stored instead of
                ``resp.url`` because redirects (e.g. http://bbs.people.com.cn/
                to http://bbs1.people.com.cn/) would otherwise make the
                stored urls differ from the crawled ones.
        """
        # resp.text is decoded text (type 'str'),
        # resp.content is the raw body (type 'bytes').
        text = resp.text
        if (resp.encoding == 'ISO-8859-1'
                and 'ISO-8859-1' not in resp.headers.get('Content-Type', '')):
            # ISO-8859-1 is only requests' fallback here; most of the time
            # the real encoding can be detected in the page content.
            try:
                real_encoding = requests.utils.get_encodings_from_content(resp.text)[0]
                text = resp.content.decode(real_encoding, 'ignore')
            except Exception:
                text = resp.content.decode('utf-8', 'ignore')

        html_tree = etree.HTML(text)
        kws = []
        descs = []
        if html_tree is not None:
            kws = html_tree.xpath('//*/meta[re:test(@name, "[Kk]eywords?")]/@content',
                                  namespaces={'re': "http://exslt.org/regular-expressions"})
            descs = html_tree.xpath('//*/meta[re:test(@name, "[Dd]escription")]/@content',
                                    namespaces={'re': "http://exslt.org/regular-expressions"})
        kw = (kws[0] if kws else "").encode('utf-8', 'ignore')
        desc = (descs[0] if descs else "").encode('utf-8', 'ignore')

        try:
            real_encoding = requests.utils.get_encodings_from_content(resp.text)[0]
            utf8_text = resp.content.decode(real_encoding, "ignore").encode('utf-8')
        except Exception:
            # Deliberately best-effort (no or unknown charset in the page):
            # re-encode the already decoded text, so the stored bytes are
            # valid UTF-8 even when the raw body is in another encoding.
            utf8_text = text.encode('utf-8', 'ignore')

        page_url = origin_url
        _, domain_name = self.get_protocal_domain(resp.url)
        domain_name = bytes(domain_name, 'utf-8')
        titles = re.findall(rb'<title>(.*?)</title>', utf8_text)
        title = titles[0] if titles else b''

        self.db.update("INSERT INTO " + self.crawler_table + "(`page_url`, `domain_name`,"
                       "`title`, `text`, `keywords`, `description`) "
                       "VALUES (%s, %s, %s, %s, %s, %s);",
                       (page_url, domain_name, title, utf8_text, kw, desc))
class Crawler(object):
    """The crawler.

    ``run`` loops forever: it fetches a batch of urls from the manager,
    downloads them with a thread pool, stores each page (including its inner
    and outer links) in the database and sends the extracted links back to
    the manager.
    """

    def __init__(self):
        """Read ``crawler.conf``, connect to the database and reset the table.

        NOTE: the crawler table is created if missing and then *truncated*,
        so every start of the crawler begins with an empty table.
        """
        self.conf = ConfReader("crawler.conf", default_conf)
        self.logger = Logger()
        self.db = None

        self.thread_pool_size = self.conf.get("thread_pool_size")
        self.left_ip = self.conf.get("manager_ip")
        self.left_port = self.conf.get("manager_port")
        self._buffer_size_threshold = self.conf.get("buffer_size_threshold")
        # How many links get_links() returns per call (at most).
        self._crawl_NR = self.conf.get("concurrent_crawl_NR")

        self.my_open = uopen if self.conf.get("buffer_output") == "no" else open
        self.content_path = self.conf.get("content_path")
        self.crawling_timeout = self.conf.get("crawling_timeout")

        self.DB_url = self.conf.get("DB_url")
        self.DB_user = self.conf.get("DB_user")
        self.DB_passwd = self.conf.get("DB_passwd")
        self.crawler_DB = self.conf.get("crawler_DB")
        self.crawler_table = self.conf.get("crawler_table")

        self.db = DBHandler(self.crawler_DB, self.DB_user, self.DB_passwd,
                            self.DB_url)
        self.db.connect()
        # MySQL columns are case insensitive (contrary to Oracle) for search
        # operations; "BINARY" in the column definition changes that default.
        self.db.update("CREATE TABLE IF NOT EXISTS `" + self.crawler_table + "` ("
                       " `page_id` int(20) NOT NULL AUTO_INCREMENT,"
                       " `page_url` varchar(200) BINARY NOT NULL,"
                       " `domain_name` varchar(100) BINARY NOT NULL,"
                       " `inner_links` text,"
                       " `outer_links` text,"
                       " `title` varchar(1024),"
                       " `normal_content` text,"
                       " `emphasized_content` text,"
                       " `keywords` varchar(1024),"
                       " `description` varchar(1024),"
                       " `text` longtext,"
                       " `PR_score` double default 0.0,"
                       " `ad_NR` int default 0,"
                       " `tag1` varchar(20) default null,"
                       " `tag2` varchar(20) default null,"
                       " `tag3` varchar(20) default null,"
                       " INDEX (`page_url`),"
                       " PRIMARY KEY (`page_id`)"
                       ")CHARSET=UTF8, ENGINE=InnoDB")
        self.db.update("truncate table " + self.crawler_table)

        # Holds all the links to be sent back to the manager.
        self._result_dict = {}
        # Holds the links received from the manager and not yet crawled.
        self._buffer = []

        self.result_sender = NetworkHandler(self.left_ip, self.left_port)
        self.links_requester = NetworkHandler(self.left_ip, self.left_port)

        # Whether or not the crawler should do focused crawling.
        self.focusing = True

    def get_links(self):
        """Return the next batch of urls to crawl.

        A local buffer is used so that a big batch can be fetched from the
        manager once (e.g. 'links_to_crawler_NR' = 50) and then handed out in
        smaller portions (e.g. 'concurrent_crawl_NR' = 10), one per call.
        The manager is only contacted when the buffer holds fewer than
        ``buffer_size_threshold`` links; its answer also updates
        ``self.focusing``.

        No timeout is set here on purpose: the crawler cannot continue
        without links from the manager anyway.

        Returns:
            A list with at most ``concurrent_crawl_NR`` urls. When the
            manager has no links, whatever is left in the buffer is returned
            (possibly an empty list).

        Raises:
            Whatever ``links_requester.request()`` raises.
        """
        if len(self._buffer) < self._buffer_size_threshold:
            # The manager returns the links together with a flag telling the
            # crawler whether it should be focused-crawling or not.
            (self.focusing, links) = self.links_requester.request()
            self.logger.info("links_requester succeed request()")
            if not links:
                # Nothing new: return whatever is in the buffer.
                drained = self._buffer
                self._buffer = []
                return drained
            self._buffer.extend(links)
        return self._take_from_buffer()

    def _take_from_buffer(self):
        """Pop up to ``self._crawl_NR`` urls from the end of the buffer."""
        count = min(self._crawl_NR, len(self._buffer))
        batch = []
        for _ in range(count):
            batch.append(self._buffer.pop())
        return batch

    @staticmethod
    def req(url, **kwargs):
        """GET ``url``, retrying while the status code is not 200.

        At most ``_exceeded_try`` attempts are made in total, pausing
        ``_pause_interval`` seconds before every retry (both are module-level
        settings). ``kwargs`` are passed to ``requests.get`` unchanged.

        Returns:
            The response of the last attempt, whatever its status code.
        """
        page = requests.get(url, **kwargs)
        trytime = 1
        while trytime < _exceeded_try and page.status_code != 200:
            # Pause *before* retrying, so no time is wasted after the final
            # attempt and the server is not hit again immediately.
            time.sleep(_pause_interval)
            page = requests.get(url, **kwargs)
            trytime = trytime + 1
        return page

    def get_web(self, resolved_url):
        """Download one page.

        Returns:
            The ``Response`` object when the status is 200 and the content
            type (if announced) is textual, otherwise ``None``. Fetch errors
            are logged and also yield ``None``.
        """
        # Identify as Googlebot. This may be detected due to the ip mismatch.
        # NOTE: according to RFC 7230, HTTP header names are case-insensitive.
        headers = {
            'Accept': 'text/plain, text/html',  # want only text
            "User-agent": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
        }
        try:
            response = Crawler.req(resolved_url, headers=headers,
                                   timeout=self.crawling_timeout)
            self.logger.info("Get response[%d]: [%s]"
                             % (response.status_code, resolved_url))
            # Only textual responses are wanted. Keys of `response.headers`
            # are case insensitive.
            if 'content-type' in response.headers:
                if 'text/' not in response.headers['content-type']:
                    return None
            if response.status_code == requests.codes.ok:  # 200
                return response
            return None
        except Exception as e:
            self.logger.info("Fail to fetch page. Exception: %s, url:[%s]"
                             % (str(e), resolved_url))
            return None

    def run(self):
        """Main routine of the crawler; never returns.

        Each round: get urls from the manager, download them concurrently,
        extract and store the page data, then report the found links (or
        "FAIL") per url back to the manager.
        """
        while True:
            try:
                urls = self.get_links()
            except Exception as e:
                self.logger.info(
                    "Cannot get urls. Crawler sleep for 10 seconds.\n"
                    "\tException:[%s]\n" % str(e))
                time.sleep(10)  # wait a bit to see if things get better
                continue
            if not urls:
                self.logger.info("Empty urls from dns_resolver. Crawler will loop")
                time.sleep(10)
                continue

            # Debug output of the received batch.
            self.logger.info("GOT urls from manager: [")
            for u in urls:
                self.logger.info("\t" + u)
            self.logger.info(" ]")

            # Download the pages.
            with ThreadPoolExecutor(self.thread_pool_size) as pool:
                responses = pool.map(self.get_web, urls)

            # Combine every origin url with the links found on its page.
            # pool.map yields results in the order of `urls`.
            for origin, resp in zip(urls, responses):
                if resp is None:
                    self._result_dict[origin] = "FAIL"
                    continue
                try:
                    # resp is known to be textual; resp.text is a str.
                    outer_links, inner_links = self.extract_link(origin,
                                                                 resp.text)
                except Exception as e:
                    self.logger.info(("Exception when extract_links:[%s],"
                                      "url:[%s]\n") % (str(e), origin))
                    continue
                self.logger.info("Finished extract_links()")

                outer_links = set(outer_links)
                inner_links = set(inner_links)
                if self.focusing:
                    self.logger.info("crawler is FOCUSING now.\n")
                    self._result_dict[origin] = self.trim_url_suffix(inner_links)
                else:
                    self._result_dict[origin] = self.trim_url_suffix(outer_links)

                try:
                    self.dump_content(resp, origin, inner_links, outer_links)
                except Exception as e:
                    self.logger.info(("Exception when dump_content():[%s],"
                                      "url:[%s]") % (str(e), origin))
                    traceback.print_exc()
                    continue
                self.logger.info("Finished dump_content()")

            data = pickle.dumps(self._result_dict)
            try:
                self.result_sender.send(data)
                self.logger.info("successfully sent back to the left\n")
            except Exception as e:
                self.logger.info(("Fail sending to manager:[%s]\n"
                                  "unsent links:[%s]\n")
                                 % (str(e), str(self._result_dict)))
            finally:
                self._result_dict = {}

    def extract_link(self, origin_url, html):
        """Extract the links of all ``<a href>`` elements of a page.

        Args:
            origin_url: url the page was fetched from.
            html: the page markup as text.

        Returns:
            A tuple ``(outer_links, inner_links)`` of lists with absolute
            http/https urls; see ``_classify_links`` for the rules.
        """
        html_tree = etree.HTML(html)
        # NOTE(review): the parser may give back None for input without any
        # usable markup; treat that as "no links".
        if html_tree is None:
            return [], []
        hrefs = html_tree.xpath('//*/a/@href')  # relative or absolute
        return self._classify_links(origin_url, hrefs)

    def _classify_links(self, origin_url, hrefs):
        """Split raw href values into outer and inner absolute links.

        Links to unwanted file types and links with a non-http(s) scheme are
        dropped. Relative links are resolved against ``origin_url``. A link
        is "inner" when its domain equals the domain of ``origin_url``.

        Returns:
            A tuple ``(outer_links, inner_links)`` of lists.
        """
        origin_url = origin_url.strip()
        _, domain = self.get_protocal_domain(origin_url)

        # Useless file pattern (xxx.jpg, xxx.mp4, xxx.css, xxx.pdf, etc).
        uf_pattern = re.compile(
            r'\.jpg$|\.png|\.xml|\.mp4|\.mp3|\.css|\.pdf|\.svg|\.gz|\.zip|\.rar|\.exe|\.tar'
        )
        # Unsupported protocol pattern (ftp://, sftp://, thunders://, etc).
        up_pattern = re.compile(r'^.{0,10}:')
        # Only the http/https protocols are supported.
        sp_pattern = re.compile(r'http://|https://')

        outer_link_lists = []
        inner_link_lists = []
        for element in hrefs:
            element = element.strip()
            if uf_pattern.search(element):
                continue
            if not sp_pattern.match(element):
                if up_pattern.search(element):
                    continue
                # Resolve relative references properly: this handles
                # "page.html" on a non-root page as well as "//host/path".
                try:
                    element = urllib.parse.urljoin(origin_url, element)
                except ValueError:
                    continue  # malformed href, skip it
                if not sp_pattern.match(element):
                    continue
            _, test_domain = self.get_protocal_domain(element)
            if test_domain != domain:
                outer_link_lists.append(element)
            else:
                inner_link_lists.append(element)
        return outer_link_lists, inner_link_lists

    def trim_url_suffix(self, urls):
        """Strip the `#xxxxx' and `?xxxx' suffixes from every url.

        NOTE that ALL URLS PASSED IN MUST BE VALID!

        Returns:
            A list with the trimmed urls, in iteration order of ``urls``.
        """
        return [url.split('#')[0].split('?')[0] for url in urls]

    def get_protocal_domain(self, url):
        """Return ``(protocol, domain)`` of ``url``.

        The domain is the network location, including a port if present.
        A missing part (or an unparsable url) is reported as ``None``.
        """
        try:
            parts = urllib.parse.urlsplit(url)
        except ValueError:
            return None, None
        return parts.scheme or None, parts.netloc or None

    def dump_content(self, resp, origin_url, inner_links, outer_links):
        """Store the page data and its links in the database.

        requests takes ``response.encoding`` from the HTTP response header
        and defaults to 'ISO-8859-1' when the header names no charset, so the
        charset declared inside the page content is consulted to convert the
        page to UTF-8.

        Args:
            resp: the ``Response`` of the page.
            origin_url: the url that was requested. It is stored instead of
                ``resp.url`` because redirects (e.g. http://bbs.people.com.cn/
                to http://bbs1.people.com.cn/) would otherwise make the
                stored urls differ from the crawled ones.
            inner_links: iterable of same-domain link strings; stored joined
                with ";".
            outer_links: iterable of other-domain link strings; stored joined
                with ";".
        """
        # resp.text is decoded text (type 'str'),
        # resp.content is the raw body (type 'bytes').
        text = resp.text
        if (resp.encoding == 'ISO-8859-1'
                and 'ISO-8859-1' not in resp.headers.get('Content-Type', '')):
            # ISO-8859-1 is only requests' fallback here; most of the time
            # the real encoding can be detected in the page content.
            try:
                real_encoding = requests.utils.get_encodings_from_content(
                    resp.text)[0]
                text = resp.content.decode(real_encoding, 'ignore')
            except Exception:
                text = resp.content.decode('utf-8', 'ignore')

        html_tree = etree.HTML(text)
        kws = []
        descs = []
        if html_tree is not None:
            kws = html_tree.xpath(
                '//*/meta[re:test(@name, "[Kk]eywords?")]/@content',
                namespaces={'re': "http://exslt.org/regular-expressions"})
            descs = html_tree.xpath(
                '//*/meta[re:test(@name, "[Dd]escription")]/@content',
                namespaces={'re': "http://exslt.org/regular-expressions"})
        kw = (kws[0] if kws else "").encode('utf-8', 'ignore')
        desc = (descs[0] if descs else "").encode('utf-8', 'ignore')

        try:
            real_encoding = requests.utils.get_encodings_from_content(
                resp.text)[0]
            utf8_text = resp.content.decode(real_encoding,
                                            "ignore").encode('utf-8')
        except Exception:
            # Deliberately best-effort (no or unknown charset in the page):
            # re-encode the already decoded text, so the stored bytes are
            # valid UTF-8 even when the raw body is in another encoding.
            utf8_text = text.encode('utf-8', 'ignore')

        page_url = origin_url
        _, domain_name = self.get_protocal_domain(resp.url)
        domain_name = bytes(domain_name, 'utf-8')
        titles = re.findall(rb'<title>(.*?)</title>', utf8_text)
        title = titles[0] if titles else b''
        inner_links = ";".join(inner_links)
        outer_links = ";".join(outer_links)

        self.db.update(
            "INSERT INTO " + self.crawler_table + "(`page_url`, `domain_name`,"
            "`inner_links`,`outer_links`,`title`, `text`, `keywords`, `description`) "
            "VALUES (%s, %s, %s, %s, %s, %s, %s, %s);",
            (page_url, domain_name, inner_links, outer_links, title,
             utf8_text, kw, desc))