Exemplo n.º 1
0
            message = self.urlCorpus.makeMessage(url_key, fake_message_string)
            self.urlCorpus.addMessage(message)
        else:
            fake_message_string = cached_message.as_string()

        msg = message_from_string(fake_message_string)

        # We don't want to do full header tokenising, as this is
        # optimised for messages, not webpages, so we just do the
        # basic stuff.
        bht = options["Tokenizer", "basic_header_tokenize"]
        bhto = options["Tokenizer", "basic_header_tokenize_only"]
        options["Tokenizer", "basic_header_tokenize"] = True
        options["Tokenizer", "basic_header_tokenize_only"] = True

        tokens = Tokenizer().tokenize(msg)
        pf = options["URLRetriever", "x-web_prefix"]
        tokens = ["%s%s" % (pf, tok) for tok in tokens]

        # Undo the changes
        options["Tokenizer", "basic_header_tokenize"] = bht
        options["Tokenizer", "basic_header_tokenize_only"] = bhto
        return tokens

    def _base_url(self, url):
        # To try to speed things up, and to avoid following
        # unique URLs, we convert the URL to as basic a form
        # as we can - so http://www.massey.ac.nz/~tameyer/index.html?you=me
        # would become http://massey.ac.nz and http://id.example.com
        # would become http://example.com
        url += '/'