def parse(self):
    """Parse page content.

    Converts the raw contents to unicode, then extracts links and
    metadata (base URL, robots follow/index directives). Resolved,
    sanitized links are accumulated into ``self.links``.

    Side effects: updates ``self.encoding``, ``self.parser``,
    ``self.base``, ``self.follow``, ``self.index`` and ``self.links``.
    """
    # Converting to unicode, trying the declared encoding first.
    unicoder = UnicodeBugger(self.contents, [self.encoding])
    data = unicoder.convert()
    self.encoding = unicoder.encoding
    # extracting link and meta information
    parser = LinkExtractor(data)
    parser.parse()
    self.parser = parser
    if parser.base:
        # A <base> tag overrides the page's own URL for link resolution.
        self.base = self.sanitizeURL(parser.base)
    self.follow = parser.follow
    self.index = parser.index
    # Prepare to get all the links from the page
    base_url = URL(self.base)
    for u in parser.links:
        try:
            l = URL(u)
            # FIXME this is where dynamic and fragments should be removed
            l.fragment = None
            l.query = None
            if l.isRelative():
                self.links.add(str(base_url + l))
            else:
                self.links.add(str(l))
        # BUGFIX: the original `except A, B:` form caught only A and
        # bound the exception instance to the name B — InvalidURLException
        # was never actually caught and would propagate out of parse().
        # A parenthesized tuple catches both, as intended.
        except (NotSupportedSchemeException, InvalidURLException):
            # We just blindly ignore unsupported and invalid URLs
            pass
def sanitizeURL(self, url):
    """Normalize *url*, stripping its fragment and query components.

    Returns the sanitized URL as a string.

    FIXME should be part of urltools
    """
    sanitized = URL(url)
    sanitized.fragment = None
    sanitized.query = None
    return str(sanitized)