Example #1
    def parse(self):
        """Parses page content.
        
        Unicod'ification and metadata and link extraction happens here.
        """
        # Converting to unicode
        unicoder = UnicodeBugger(self.contents, [self.encoding])
        data = unicoder.convert()
        self.encoding = unicoder.encoding
        
        # extracting link and meta information
        parser = LinkExtractor(data)
        parser.parse()
        self.parser = parser
        if parser.base:
            self.base = self.sanitizeURL(parser.base)
        self.follow = parser.follow
        self.index = parser.index

        # Prepare to get all the links from the page
        base_url = URL(self.base)
        for u in parser.links:
            try:
                l = URL(u)
                # FIXME this is where dynamic components (query strings) and fragments should be removed
                l.fragment = None
                l.query = None
                if l.isRelative():
                    self.links.add(str(base_url + l))
                else:
                    self.links.add(str(l))
            except (NotSupportedSchemeException, InvalidURLException):
                # We just blindly ignore unsupported and invalid URLs
                pass
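
The same link-handling step can be sketched with only the standard library. URL, LinkExtractor, and the exception types above are project-specific and not shown here, so urllib.parse stands in for them; this is a minimal sketch of the normalization the loop performs, not the project's actual helper.

from urllib.parse import urljoin, urlsplit, urlunsplit

def normalize_links(base, hrefs):
    """Resolve links against a base URL and drop query strings and fragments,
    mirroring the loop in parse() above (standard-library sketch)."""
    links = set()
    for href in hrefs:
        resolved = urljoin(base, href)               # resolves relative links
        scheme, netloc, path, _query, _fragment = urlsplit(resolved)
        if scheme not in ("http", "https"):          # skip unsupported schemes
            continue
        links.add(urlunsplit((scheme, netloc, path, "", "")))
    return links

# normalize_links("http://example.com/a/", ["b.html#top", "/c?x=1"])
# -> {"http://example.com/a/b.html", "http://example.com/c"}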
Example #2
    def sanitizeURL(self, url):
        """Does URL normalization and sanitization.

        Remove query and fragments from a URL.
        """
        # FIXME should be part of urltools
        u = URL(url)
        u.query = None
        u.fragment = None
        return str(u)
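
For comparison, the query and fragment stripping done by sanitizeURL() can be approximated with urllib.parse alone; sanitize_url below is a hypothetical stand-in, not part of the project code.

from urllib.parse import urlsplit, urlunsplit

def sanitize_url(url):
    """Strip the query string and fragment, as sanitizeURL() does above (sketch)."""
    scheme, netloc, path, _query, _fragment = urlsplit(url)
    return urlunsplit((scheme, netloc, path, "", ""))

# sanitize_url("http://example.com/page?id=3#section") -> "http://example.com/page"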