class SCSpider(Process):
    """SCSpider: a crawler process working on a single base URL (site)."""

    Num = 0

    def __init__(self, *sc_queues, **kwargs):
        Process.__init__(self)
        SCSpider.Num += 1
        self.pnum = SCSpider.Num
        if sc_queues:
            self.scqs = sc_queues
        else:
            self.scqs = list()
        self.due = DUEUnit()
        # self.headers keeps the HTTP User-Agent header used for masking the crawler
        self.headers = {'User-Agent': kwargs.pop("spider_spoof_id", None)}
        if self.headers['User-Agent'] is None:
            self.headers = {'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-GB; rv:1.9.1.9)'}
        self.kill_evt = kwargs.pop("kill_evt", multiprocessing.Event())
        self.urls_l = [kwargs.pop("seed", None)]
        # Use an external Queue only for inter-process communication, if any
        self.xtrees_q = kwargs.pop("xtrees_q", Queue())
        # ext_due_q is a Queue of URL links destined for an external DUE-Unit
        self.ext_url_q = kwargs.pop("ext_due_q", None)
        self.base_url_drop_none = kwargs.pop("base_url_drop_none", True)
        # urls_number_stop: default stop limit (if none given by the user), both for politeness
        # and because there is little point in collecting more samples of the same site
        self.urls_number = kwargs.pop("urls_number_stop", 10000)
        self.webpg_vect_tu = kwargs.pop("webpg_vect_tu", None)
        self.save_path = kwargs.pop("save_path", None)

    def run(self):
        """SCSpider's main function."""
        # Use the netloc (network locator) of the seed URL as the base URL this spider works on
        url = urlparse(self.urls_l[0])
        hash = hashlib.md5()
        hash.update(url.scheme + "://" + url.netloc)
        hashkey = hash.hexdigest()
        self.due.setBase(url.scheme + "://" + url.netloc)
        # Define a pool of green threads for fetching and see how it goes
        # fetchers_p = Pool(10)  # eventlet.GreenPool(10000)
        fetchers_p = eventlet.GreenPool(100)
        # A thread constantly checks whether the DUE seen dictionary is big enough to be saved to disk
        disk_keeper_thrd = Thread(target=self.savedue)
        disk_keeper_thrd.start()
        # Start a thread that will analyse the pages for further processing.
        # If no process consumes this analysis, the thread simply never starts.
        if self.webpg_vect_tu:
            scvectgen_t = SCVectGen(self.webpg_vect_tu, self.xtrees_q,
                                    kill_evt=self.kill_evt, save_path=self.save_path)
            scvectgen_t.start()
        # Counter for the URLs that have been followed by the crawler
        scanned_urls = 0
        # From this line down, the main loop of this process starts
        while True:
            # Termination conditions for this spider: an external signal, a user-defined condition,
            # or the spider not finding any more URLs related to the crawling target.
            # Terminate - external signal, or functional problems occurred while crawling
            if self.kill_evt.is_set():
                print("SCSpider Process (PID = %s - PCN = %s): Terminated" % (self.pid, self.pnum))
                SCSpider.Num -= 1
                break
            # Terminate - no more URLs to follow
            if self.urls_l == []:
                print("SCSpider Process (PID = %s - PCN = %s): Terminated - No more URL links to follow" % (self.pid, self.pnum))
                SCSpider.Num -= 1
                self.due.savetofile()
                self.kill_evt.set()
                break
            # Terminate - some user-defined condition reached
            if scanned_urls > self.urls_number:
                print("SCSpider Process (PID = %s - PCN = %s): Terminated - User Condition: Stop On %d Pages (%d have been followed)"
                      % (self.pid, self.pnum, self.urls_number, scanned_urls))
                SCSpider.Num -= 1
                self.due.savetofile()
                self.kill_evt.set()
                break
            else:
                print("SCANNED URLS: %d" % scanned_urls)
                scanned_urls += len(self.urls_l)
            # Get one of the URLs that another SCSpider has left on the external queue and append it to urls_l
            if self.ext_url_q is not None:
                ext_url = self.ext_url_q.get(self.due.base_url['hashkey'], 2)
                if ext_url:
                    self.urls_l.append(ext_url)
            tmp_urls_l = list()  # SHOULD BE HERE
            # Start processing web pages (in fact, sockets to them) which a pool of
            # green threads is harvesting asynchronously
            # for page_soc in fetchers_p.imap_unordered(ffetchsrc, self.urls_l):
            for xhtml in fetchers_p.imap(self.fetchsrc, self.urls_l):
                # ~~~~~~~~ Maybe the following code can be wrapped up in a few sub-processes or threads ~~~~~~~~
                if xhtml[0] is None:
                    print("SPIDER %d of %d BASE %s" % (self.pnum, SCSpider.Num, self.due.base_url['url']))
                    print("Empty Page : %s" % xhtml[1])
                    continue
                else:
                    xhtml_s = xhtml[0]
                # Find the proper encoding of the byte string that urlopen has returned and
                # decode it before passing it to the lxml.html parser
                if xhtml[1]:
                    if xhtml[1] != 'UTF-8' and xhtml[1] != 'utf-8':
                        print(xhtml[1])
                        # Decode to the proper string encoding. The data coming from urlopen() is a raw byte string
                        # xhtml_s = xhtml_s.decode(xhtml[1], 'ignore')  # 'ignore': error handling for invalid UTF (or other) encoding characters
                else:
                    # Try to figure out whether or not the data come with utf-8 encoding...
                    # Maybe this is not needed because the encoding is retrieved from urlopen() and not from the lxml library,
                    # OR maybe it is needed because the encoding information is derived from the HTTP response headers.
                    # utf_s = UnicodeDammit(xhtml_s, isHTML=True)
                    # if utf_s.unicode:
                    #     xhtml_s = utf_s.unicode  # Be careful of what you get
                    pass
                # Parse the XHTML source fetched by the green threads
                xhtml_t = self.parsetoXtree(xhtml_s, clean_xhtml=True)
                # While this process tries to extract URLs, hand the tree over for PARALLEL
                # processing by the SCVectGen thread (lxml element trees are supposed to be thread safe)
                xhtml_t['charset'] = xhtml[1]
                xhtml_t['url_req'] = xhtml[2]
                xhtml_t['url_resp'] = xhtml[3]
                xhtml_t['base_url'] = self.due.base_url['url']
                # print("IN")
                self.xtrees_q.put(xhtml_t)
                # print("IN DONE")
                # Get every URL link by traversing the etree, evaluate them, perform UST, and then put them either
                # into the proper DUEUnit (internal or external) or hand them to the green-threaded (XHTML) source fetcher(s)
                xtree = xhtml_t['xtree']
                if xtree:
                    xhtml_troot = xtree.getroot()
                else:
                    continue
                if xhtml_troot is None:
                    # If there is no html etree root, skip the URL processing
                    continue
                count = 0
                for link in xhtml_troot.iterlinks():
                    if link[1] == 'href':
                        if link[2].find(".css") == -1 and link[2].find(".jpg") == -1 and link[2].find(".gif") == -1\
                                and link[2].find(".png") == -1 and link[2].find("javascript:0") == -1 and link[2].find(".ico") == -1:
                            parsed_u = urlparse(link[2])
                            prsd_url = str(parsed_u.scheme + "://" + parsed_u.netloc)
                            if prsd_url == self.due.base_url['url']:
                                seen = self.ust(link[2])
                                if not seen:
                                    count += 1
                                    if self.due.seen_len() < 30:
                                        pass  # print("SPIDER %d APPEND_LINKS %s SEEN-LIST %s" % (self.pnum, count, self.due.seen_len()))
                                    tmp_urls_l.append(link[2])
                                # else: discard this previously seen URL link
                            else:
                                # If the base URL is not the one this SCSpider is working on,
                                # try to pass it to the other SCSpider processes
                                if self.ext_url_q is not None:
                                    # print("Sendto EXTERNAL LINKS: %s" % link[2])
                                    # if self.ext_url_q.full(self.due.base_url['hashkey']) == True:
                                    #     print("FULL Queue bottleneck\n")
                                    # elif self.ext_url_q.full(self.due.base_url['hashkey']) == None:
                                    #     pass  # print("NONE\n")
                                    self.ext_url_q.put(link[2])
                                elif self.base_url_drop_none:
                                    seen = self.ust(link[2])
                                    if seen:
                                        tmp_urls_l.append(link[2])
            # Now give the new URL list back to the fetcher green threads
            del self.urls_l  # to prevent memory leakage; remove if it has no effect but delaying the inevitable
            self.urls_l = tmp_urls_l
        # If this process has to terminate, wait for the disk_keeper thread to finish its job and join
        self.due.acquire()
        # Wake up the disk_keeper thread in case it is still waiting
        self.due.notify_all()
        self.due.release()
        disk_keeper_thrd.join()
        # Join scvectgen_t only if it has previously been started
        if self.webpg_vect_tu:
            scvectgen_t.join()

    def parsetoXtree(self, xhtml_s, clean_xhtml=False):
        if clean_xhtml:
            cleaner = Cleaner(scripts=True, javascript=True, comments=True, style=True,
                              links=True, meta=True, page_structure=False, processing_instructions=True,
                              embedded=True, annoying_tags=True, remove_unknown_tags=True)  # meta=False because we need MetaInfo
            try:
                xhtml_s = cleaner.clean_html(xhtml_s)
            except Exception:
                pass
        # The HTML parsers with and without recover mode, with the capability to download the proper DTD always ON.
        # In case the lxml.html parser is dispatched to sub-processes or threads, the
        # HTMLParser(s) should be defined within those sub-processes or threads
        htmlparser = lxml.html.HTMLParser(recover=False, no_network=False)
        htmlparser_rcv = lxml.html.HTMLParser(recover=True, no_network=False)
        # Parse the XHTML source
        parsing_errors = list()
        try:
            xhtml_t = lxml.html.parse(StringIO(xhtml_s), parser=htmlparser, base_url=self.due.base_url['url'])
        except Exception as error:
            # except ValueError, error:
            # except lxml.etree.XMLSyntaxError, error:
            # print(xhtml_s)
            # print("PARSE ERROR (no recovery mode): %s" % error)
            # parsing_errors.append(error)
            try:
                xhtml_t = lxml.html.parse(StringIO(xhtml_s), parser=htmlparser_rcv, base_url=self.due.base_url['url'])
            except Exception as error:
                print("PARSE ERROR (recovery mode): %s" % error)
                parsing_errors.append(error)
                try:
                    print('DA ZOUP')
                    xhtml_t = soup.parse(xhtml_s)  # StringIO(xhtml_s)
                except Exception:
                    print("F****D-UP PAGE")
                    parsing_errors.append("BeautifulSoup Failed")
                    return {'xtree': None, 'parsing_errors': parsing_errors}
        # Get the root element and make the links absolute
        xhtml_troot = xhtml_t.getroot()
        try:
            xhtml_troot.make_links_absolute(self.due.base_url['url'], resolve_base_href=True)
        except Exception:
            return {'xtree': None, 'parsing_errors': parsing_errors}
        for i in xhtml_t.iterlinks():
            pass
        return {'xtree': xhtml_t, 'parsing_errors': parsing_errors}

    def fetch(self, url):
        # print("IN FETCH: " + str(self.headers))
        # print(url)
        rq = urllib2.Request(url, headers=self.headers)
        socket = urllib2.urlopen(rq)
        return socket

    def fetchsrc(self, url_req):
        htmlsrc = None
        socket = None
        charset = None
        url_resp = None
        try:
            rq = urllib2.Request(url_req, headers=self.headers)
            socket = urllib2.urlopen(rq)
            htmlsrc = socket.read()
            charset = socket.info().getparam('charset')
            url_resp = socket.geturl()
            socket.close()
        except HTTPError as e:
            print("HTTP ERROR: %s" % e.read())
        # Return a tuple of the HTML source, its character encoding, the requested URL, and the response URL
        return (htmlsrc, charset, url_req, url_resp)

    def savedue(self):
        while not self.kill_evt.is_set():
            self.due.acquire()
            while self.due.seen_len() < 100:
                # This forces the thread to stop in case a global stop signal is given
                if self.kill_evt.is_set():
                    return
                # If the seen dictionary is still small, wait (i.e. sleep)
                self.due.wait()
            if not self.due.savetofile():
                print("FILE NOT SAVED - HALT")
                self.kill_evt.set()
                return
            # self.due.notify_all()
            self.due.release()

    def ust(self, link):
        self.due.acquire()
        seen = self.due.ust(link)
        self.due.notify_all()
        self.due.release()
        return seen
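
# ----------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of the original module).
# It assumes the module-level imports used by the class above
# (multiprocessing, Thread, Queue, urlparse, hashlib, eventlet, urllib2,
# lxml.html, StringIO) and that the project classes DUEUnit / SCVectGen
# are importable in this namespace. The seed URL and the numeric values
# below are hypothetical examples, not values taken from the project.
if __name__ == '__main__':
    kill_evt = multiprocessing.Event()
    spider = SCSpider(seed="http://www.example.com/",   # hypothetical seed URL
                      kill_evt=kill_evt,                 # shared shutdown signal
                      urls_number_stop=500)              # stop after ~500 followed URLs
    spider.start()
    try:
        spider.join()
    except KeyboardInterrupt:
        # Ask the spider (and its disk_keeper / SCVectGen helpers) to shut down cleanly.
        kill_evt.set()
        spider.join()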