def _load_url(self, url_u, encoding=None):
    """Fetch the page at url_u, strip its markup and show it in the text view.

    If encoding is None it is detected from the fetched bytes.
    """
    # The word hit list refers to the previous page; drop it.
    self.wordhitview.clear_words()
    # Download the page and decode the raw bytes to unicode.
    fetched = fetcher.fetch(url_u)
    if not encoding:
        encoding = decoder.detect_encoding(fetched.txt_byte)
    page_u = decoder.decode(fetched.txt_byte, encoding)
    # Try wiki markup removal first; fall back to html stripping when
    # unwiki returns a falsy result.
    page_u = unmarkup.unwiki(page_u) or unmarkup.unhtml(page_u)
    self.text = word.Text()
    self.text.set_from_txt_u(page_u)
    self.textview.set_text(self.text, encoding, url_u)
def find_next(url_u, web, handler=None):
    """Spider the page at url_u and return the next url chosen from its links.

    If handler is given it is invoked with url_u before spidering.
    """
    url_byte = decoder.encode(url_u)
    if handler:
        io.output("Running handler with page: %s" % url_byte)
        handler(url_u)
    io.output("Spidering page: %s" % url_byte)
    page_byte = get_page(url_byte)
    links_byte = find_urls_in_page(web, page_byte, url_u, url_byte)
    # Candidate urls are raw bytes; pick_url needs the page's encoding
    # to interpret them.
    return pick_url(links_byte, encoding=decoder.detect_encoding(page_byte))
def set_from_file(self, filename, encoding=None):
    """Read filename into self.txt_u as unicode.

    If encoding is None it is detected from the file contents first
    (via the project's decoder module).  Returns the encoding used.
    """
    if not encoding:
        encoding = decoder.detect_encoding(filename=filename)
    # Use a context manager so the handle is closed even if decoding
    # raises -- the original leaked the file object.  The obsolete "U"
    # mode flag is dropped as well: it was removed in Python 3.11 and
    # codecs.open always works on binary streams anyway.
    with codecs.open(filename, "r", encoding) as f:
        self.txt_u = f.read()
    return encoding
# NOTE(review): demo/driver code, Python 2 only (bare `print` statement,
# `cmp=` keyword to sorted()).  Exercises the Text class against a local
# file and then a live web page; `Text`, `print_by_freq` and `decoder`
# come from elsewhere in this module's scope.
print

import sys

# Index the file named on the command line and dump a frequency table.
text = Text()
text.set_from_file(sys.argv[1])
text.do_index()
data = text.by_freq()
print_by_freq(data)

import fetcher

# Same again, but on a page fetched over http.
url_u = u"http://www.dagbladet.no"
text = Text()
ret = fetcher.fetch(url_u)
encoding = decoder.detect_encoding(ret.txt_byte)
text.set_from_txt_byte(ret.txt_byte, encoding, untag=True)
text.do_index()
data = text.by_freq()
print_by_freq(data)
print (encoding)
sys.exit()

def out(dct, f):
    # Write one "<hit count> <word>" line per word, sorted
    # case-insensitively, to file f as utf-8.
    # NOTE(review): unreachable at runtime -- defined after sys.exit().
    ws = dct.keys()
    ws = sorted(ws, cmp=lambda x, y: cmp(x.lower(), y.lower()))
    s = ""
    for w in ws:
        s += "%-6.6s %s\n" % (dct[w].len_hits(), w)
    codecs.open(f, "w", "utf-8").write(s)
def set_from_file(self, filename, encoding=None):
    '''Read filename into self.txt_u as unicode; return the encoding used.

    If encoding is None it is detected from the file contents first
    (via the project's decoder module).
    '''
    if not encoding:
        encoding = decoder.detect_encoding(filename=filename)
    # Close the handle deterministically (the original leaked it) and
    # drop the obsolete 'U' mode flag, which was removed in Python 3.11;
    # codecs.open operates on binary streams regardless.
    with codecs.open(filename, 'r', encoding) as f:
        self.txt_u = f.read()
    return encoding
# NOTE(review): this leading loop references `w`, which is not defined in
# this chunk -- it presumably sits inside an enclosing `for w in ...` loop
# cut off by the chunk boundary; confirm indentation against the full file.
for h in w.get_hits():
    print h.get_pos(),
print

import sys

# Demo driver, Python 2 only (bare `print` statement, `cmp=` keyword to
# sorted()).  Index the file named on the command line and dump a word
# frequency table; `Text`, `print_by_freq` and `decoder` come from
# elsewhere in this module's scope.
text = Text()
text.set_from_file(sys.argv[1])
text.do_index()
data = text.by_freq()
print_by_freq(data)

import fetcher

# Same again for a live web page.
url_u = u'http://www.dagbladet.no'
text = Text()
ret = fetcher.fetch(url_u)
encoding = decoder.detect_encoding(ret.txt_byte)
text.set_from_txt_byte(ret.txt_byte, encoding, untag=True)
text.do_index()
data = text.by_freq()
print_by_freq(data)
print(encoding)
sys.exit()

def out(dct, f):
    # Write one '<hit count> <word>' line per word, sorted
    # case-insensitively, to file f as utf-8.
    # NOTE(review): unreachable at runtime -- defined after sys.exit().
    ws = dct.keys()
    ws = sorted(ws, cmp=lambda x, y: cmp(x.lower(), y.lower()))
    s = ''
    for w in ws:
        s += '%-6.6s %s\n' % (dct[w].len_hits(), w)
    codecs.open(f, 'w', 'utf-8').write(s)