def pulverizeData(self):
    """
    Pulverize the original raw data: build the set of links still to be
    visited (``self.anchors``) and a cleaned-up text version of the page
    (``self.content``).

    Reads ``self.rawData``, ``self.baseUri`` and ``self.currentUri``;
    records the current uri as seen in the PhrasetankSink so it is never
    crawled again.
    """
    self.anchors = set()
    if not self.rawData:
        # Nothing was fetched for this page; leave anchors empty.
        return
    soup = BeautifulSoup(self.rawData)
    # Get rid of all script, style and link elements -- they carry no prose.
    for elem in soup.findAll(["script", "link", "style"]):
        elem.extract()
    # Guard against malformed pages without an <html> element: the
    # original str(soup.html.body) would raise AttributeError there.
    body = soup.html.body if soup.html is not None else None
    text_content = str(body) if body is not None else str(soup)
    # Clean all unworthy characters in the text document.
    cleaner = HtmlCleaner()
    self.content = cleaner.clean_text(text_content)
    # Ensure that the current uri is not crawled again by putting it
    # into the list of crawled uris.
    PhrasetankSink.addSeenURI(self.currentUri)
    # Extract every crawlable, not-yet-seen link on the document.
    for link in soup.findAll("a", dict(href=True)):
        uri = PhrasetankRule.isCrawlable(self.baseUri, link["href"])
        if uri is not None and not PhrasetankSink.hasSeenURI(uri):
            # Add this for later crawling.
            self.anchors.add(uri)
def fetchURIData(self, uri):
    """
    Visit *uri*, fetch its page content and push the result through the
    pulverizer: newly discovered links are added to ``self.uriTank`` and
    the cleaned text content is handed to a PhrasetankConsumer.

    A uri whose fetch fails (or yields an empty body) is recorded as
    seen so it is not retried, and an ERROR is logged.
    """
    # Use the file's logMessage convention instead of a bare print
    # statement, consistent with the START/SEEN/ERROR logging elsewhere.
    logMessage('VISIT', 'Visiting page at ' + uri)
    contentsource = None
    try:
        client = HTTPClient()
        response = client.fetch(uri)
        contentsource = response.body or None
    except Exception:
        # Fetch errors are expected while crawling; the failure path
        # below marks the uri as seen and logs it. Narrowed from a bare
        # except so KeyboardInterrupt/SystemExit still propagate.
        contentsource = None
    if contentsource:
        # Send the content source to the pulverizer for processing.
        pulverize = PhrasetankPulverizer()
        pulverize.setBaseURI(self.uri)
        pulverize.setCurrentURI(uri)
        pulverize.setRawData(contentsource)
        # Process the data.
        pulverize.pulverizeData()
        links = pulverize.getLinks() or []
        content = pulverize.getTextContent() or ''
        # Queue every newly discovered link for later crawling.
        for link in links:
            self.uriTank.add(link)
        if content:
            # Call the consumer for further processing of the data content.
            consumer = PhrasetankConsumer()
            consumer.setDataContent(content)
            consumer.setProducerName(self.name)
            consumer.start()
    else:
        # Perhaps the link has an issue; we need to not visit it again.
        PhrasetankSink.addSeenURI(uri)
        logMessage("ERROR", 'Failed to read content source for ' + uri)
def run(self):
    """
    Producer entry point: keep popping uris out of ``self.uriTank`` and
    crawling them until the tank runs dry.
    """
    logMessage('START', 'PhrasetankProducer ' + self.name +
               ' is starting to crawl ' + self.uri + ' ...')
    while True:
        try:
            # set.pop raises KeyError once the tank is empty -- catch
            # exactly that instead of a bare except.
            uri = self.uriTank.pop()
        except KeyError:
            break
        if not uri:
            # Defensive: a falsy entry also terminates the crawl loop,
            # matching the original behavior.
            break
        # Make sure that we have not visited this link before.
        if PhrasetankSink.hasSeenURI(uri):
            # Log the seen status and move on to the next uri.
            logMessage('SEEN', uri + ' has been visited before now...')
            continue
        # Ok, we are ready to visit the link.
        self.fetchURIData(uri)