Пример #1
0
 def _load_url(self, url_u, encoding=None):
     """Fetch ``url_u`` and display its de-marked-up text in the text view.

     The word hit view is cleared first. When ``encoding`` is falsy it is
     detected from the fetched bytes. The decoded text is passed through
     ``unmarkup.unwiki``; if that result is falsy, ``unmarkup.unhtml`` is
     applied to the decoded text instead.
     """
     # hits belonging to the previously loaded text no longer apply
     self.wordhitview.clear_words()

     page = fetcher.fetch(url_u)
     encoding = encoding or decoder.detect_encoding(page.txt_byte)
     decoded_u = decoder.decode(page.txt_byte, encoding)

     # try wiki markup first, fall back to html when that yields nothing
     plain_u = unmarkup.unwiki(decoded_u)
     if not plain_u:
         plain_u = unmarkup.unhtml(decoded_u)

     # build the new text object and hand it to the text view
     self.text = word.Text()
     self.text.set_from_txt_u(plain_u)
     self.textview.set_text(self.text, encoding, url_u)
Пример #2
0
 def _load_url(self, url_u, encoding=None):
     """Load the page at ``url_u`` into ``self.text`` and the text view.

     Clears the word hit view, fetches the page, decodes it (detecting the
     encoding from the fetched bytes when ``encoding`` is falsy), removes
     markup with ``unmarkup.unwiki`` or, if that returns a falsy value,
     ``unmarkup.unhtml``, and shows the result in ``self.textview``.
     """
     # the current word hits refer to the old text
     self.wordhitview.clear_words()

     fetched = fetcher.fetch(url_u)
     if not encoding:
         encoding = decoder.detect_encoding(fetched.txt_byte)
     raw_u = decoder.decode(fetched.txt_byte, encoding)

     stripped_u = unmarkup.unwiki(raw_u)
     if not stripped_u:
         # wiki stripping produced nothing usable; treat the page as html
         stripped_u = unmarkup.unhtml(raw_u)

     self.text = word.Text()
     self.text.set_from_txt_u(stripped_u)
     # set text in textview
     self.textview.set_text(self.text, encoding, url_u)
Пример #3
0
def find_next(url_u, web, handler=None):
    """Spider the page at ``url_u`` and return the URL chosen by ``pick_url``.

    If ``handler`` is truthy it is called with ``url_u`` before the page is
    retrieved. ``web`` is passed through to ``find_urls_in_page``.
    """
    encoded_url = decoder.encode(url_u)

    if handler:
        io.output("Running handler with page: %s" % encoded_url)
        handler(url_u)

    io.output("Spidering page: %s" % encoded_url)
    page_bytes = get_page(encoded_url)

    # candidates are gathered before the encoding is detected
    link_candidates = find_urls_in_page(web, page_bytes, url_u, encoded_url)
    return pick_url(
        link_candidates,
        encoding=decoder.detect_encoding(page_bytes),
    )
Пример #4
0
def find_next(url_u, web, handler=None):
    """Return the next URL to visit, picked from the links on ``url_u``.

    ``handler``, when truthy, is invoked with ``url_u`` before the page is
    downloaded. ``web`` is forwarded unchanged to ``find_urls_in_page``.
    """
    target_byte = decoder.encode(url_u)

    if handler:
        io.output("Running handler with page: %s" % target_byte)
        handler(url_u)

    io.output("Spidering page: %s" % target_byte)
    body_byte = get_page(target_byte)

    found_byte = find_urls_in_page(web, body_byte, url_u, target_byte)
    # the page's detected encoding is passed on to pick_url
    page_encoding = decoder.detect_encoding(body_byte)
    next_u = pick_url(found_byte, encoding=page_encoding)
    return next_u
Пример #5
0
 def set_from_file(self, filename, encoding=None):
     """Read ``filename`` into ``self.txt_u`` and return the encoding used.

     Args:
         filename: path of the file to read.
         encoding: codec name to decode with; when falsy it is obtained from
             ``decoder.detect_encoding(filename=filename)``.

     Returns:
         The encoding that was passed in or detected.
     """
     if not encoding:
         encoding = decoder.detect_encoding(filename=filename)
     # Plain "r" instead of "rU": codecs.open opens the file in binary mode
     # whenever an encoding is given, so no newline translation took place
     # anyway, and Python 3.11+ rejects the "U" flag outright.
     # The with-block closes the handle even if read() raises.
     with codecs.open(filename, "r", encoding) as infile:
         self.txt_u = infile.read()
     return encoding
Пример #6
0
            print

    # Demo 1: build a Text from the file named by the first command-line
    # argument, index it, and pass the by_freq() result to print_by_freq.
    import sys

    text = Text()
    text.set_from_file(sys.argv[1])
    text.do_index()
    data = text.by_freq()
    print_by_freq(data)

    # Demo 2: the same index/by_freq pipeline for a page fetched from a
    # hard-coded URL.
    import fetcher

    url_u = u"http://www.dagbladet.no"
    text = Text()
    ret = fetcher.fetch(url_u)
    # the encoding is detected from the raw bytes of the fetched page
    encoding = decoder.detect_encoding(ret.txt_byte)
    text.set_from_txt_byte(ret.txt_byte, encoding, untag=True)
    text.do_index()
    data = text.by_freq()
    print_by_freq(data)
    print (encoding)

    # Terminates here; nothing after this call in the same scope is executed.
    sys.exit()

    def out(dct, f):
        """Write one "hit-count  word" line per key of ``dct`` to file ``f``.

        Keys are ordered case-insensitively. Each line pairs
        ``dct[key].len_hits()`` (left-justified, clipped to six characters)
        with the key, and the file is written as UTF-8.

        Args:
            dct: mapping from word to an object providing ``len_hits()``.
            f: path of the file to (over)write.
        """
        # key= replaces the Python-2-only cmp= argument; the ordering is the
        # same case-insensitive, stable sort.
        ordered = sorted(dct, key=lambda w: w.lower())
        # join once instead of growing a string with += in the loop
        lines = ["%-6.6s  %s\n" % (dct[w].len_hits(), w) for w in ordered]
        # the with-block guarantees the data is flushed and the handle closed
        with codecs.open(f, "w", "utf-8") as outfile:
            outfile.write("".join(lines))
Пример #7
0
 def set_from_file(self, filename, encoding=None):
     """Load the decoded contents of ``filename`` into ``self.txt_u``.

     Args:
         filename: path of the file to read.
         encoding: codec name used for decoding; when falsy, the value of
             ``decoder.detect_encoding(filename=filename)`` is used.

     Returns:
         The encoding that was supplied or detected.
     """
     if not encoding:
         encoding = decoder.detect_encoding(filename=filename)
     # 'U' is dropped from the mode: with an encoding, codecs.open always
     # reads the file in binary mode, so the flag changed nothing, and
     # Python 3.11+ raises ValueError for it.
     # Using a with-block closes the file instead of leaking the handle.
     with codecs.open(filename, 'r', encoding) as source:
         self.txt_u = source.read()
     return encoding
Пример #8
0
            for h in w.get_hits():
                print h.get_pos(),
            print

    # Demo 1: build a Text from the file given as the first command-line
    # argument, index it, and hand the by_freq() result to print_by_freq.
    import sys
    text = Text()
    text.set_from_file(sys.argv[1])
    text.do_index()
    data = text.by_freq()
    print_by_freq(data)

    # Demo 2: run the same steps on a page fetched from a fixed URL.
    import fetcher
    url_u = u'http://www.dagbladet.no'
    text = Text()
    ret = fetcher.fetch(url_u)
    # the encoding is detected from the fetched bytes
    encoding = decoder.detect_encoding(ret.txt_byte)
    text.set_from_txt_byte(ret.txt_byte, encoding, untag=True)
    text.do_index()
    data = text.by_freq()
    print_by_freq(data)
    print(encoding)

    # The process exits here, so later code in this scope never runs.
    sys.exit()

    def out(dct, f):
        """Dump the words of ``dct`` with their hit counts to the file ``f``.

        One line is written per key, sorted case-insensitively, consisting
        of ``dct[key].len_hits()`` left-justified and clipped to six
        characters, two spaces, and the key. The file is encoded as UTF-8.

        Args:
            dct: mapping from word to an object providing ``len_hits()``.
            f: path of the file to (over)write.
        """
        # a key function gives the same stable, case-insensitive order as
        # the old cmp= comparator, which only exists in Python 2
        words = sorted(dct, key=lambda word: word.lower())
        # assemble all lines first and join them once, not += per word
        rows = ['%-6.6s  %s\n' % (dct[word].len_hits(), word) for word in words]
        # close the file deterministically so the output is fully flushed
        with codecs.open(f, 'w', 'utf-8') as sink:
            sink.write(''.join(rows))