示例#1
0
    def pulverizeData(self):
        """
        Pulverizes the original raw data building up
        a map of links to be visited as well as cleaned up data

        Side effects:
          * self.anchors is reset to an empty set and then filled with
            every link accepted by PhrasetankRule.isCrawlable that
            PhrasetankSink has not seen yet.
          * self.content receives the cleaned text of the document.
          * self.currentUri is registered with PhrasetankSink as seen.

        When self.rawData is empty only self.anchors is reset; nothing
        else is touched.
        """
        self.anchors = set()

        if not self.rawData:
            return

        soup = BeautifulSoup(self.rawData)
        # Get rid of all script style and link rules
        for elem in soup.findAll(["script", "link", "style"]):
            elem.extract()

        # Not every parser wraps fragments in <html>/<body>; dereferencing
        # soup.html.body blindly raises AttributeError on such input (or
        # yields the literal text "None" when only <body> is missing).
        # Fall back to the whole parsed document in those cases.
        html = soup.html
        body = html.body if html is not None else None
        root = body if body is not None else soup
        text_content = str(root)

        # Clean all unworthy characters in the text document
        cleaner = HtmlCleaner()
        self.content = cleaner.clean_text(text_content)

        # Ensure that the current uri is not crawled again by putting it
        # into the list of crawled uri. This must happen before the link
        # scan below so that self-referencing links are filtered out by
        # the hasSeenURI check.
        PhrasetankSink.addSeenURI(self.currentUri)

        # Extract all links on the document (only <a> tags with an href)
        for link in soup.findAll("a", {"href": True}):
            # isCrawlable hands back the uri to crawl, or None to reject it
            uri = PhrasetankRule.isCrawlable(self.baseUri, link["href"])
            if uri is None:
                continue
            # Skip links we had seen before
            if PhrasetankSink.hasSeenURI(uri):
                continue
            # Add this for later crawling
            self.anchors.add(uri)
示例#2
0
 def run(self):
      """
      Start running the producer

      Pops URIs from self.uriTank until it yields nothing, skipping any
      URI that PhrasetankSink reports as already seen and handing every
      other one to self.fetchURIData. A falsy URI (None, empty string)
      ends the loop.
      """
      logMessage('START','PhrasetankProducer '+self.name+' is starting to crawl '+self.uri+' ...')

      while True:
          try:
              uri = self.uriTank.pop()
          except Exception:
              # A failing pop() is treated as "tank exhausted" (presumably
              # the empty-container error -- confirm against the uriTank
              # type). Only Exception is caught so that KeyboardInterrupt
              # and SystemExit still propagate instead of being swallowed.
              uri = None

          if not uri:
              break
          # make sure that we have not visited this link before
          if PhrasetankSink.hasSeenURI(uri):
              #Log the seen status
              logMessage('SEEN',uri+' has been visited before now...')
              continue
          #Ok we are ready to visit the link
          self.fetchURIData(uri)