Exemplo n.º 1
0
def get_text(html):
    """Decode the raw bytes of a fetched page into unicode text.

    Args:
        html: A two-item sequence ``(data, page)``. Only ``data`` (the raw
            byte string) is used; ``page`` is accepted so callers can pass
            the pair unchanged.

    Returns:
        ``data`` decoded with the encoding detected by ``UniversalDetector``.
        When no encoding can be detected (for example on empty input) the
        bytes are decoded as UTF-8 with undecodable bytes replaced.
    """
    data, _page = html  # second item is intentionally unused

    detector = UniversalDetector()
    detector.feed(data)
    detector.close()
    encoding = detector.result['encoding']

    if encoding is None:
        # The detector reports None when it cannot decide; passing None on
        # to the decoder would raise a TypeError, so fall back explicitly.
        return data.decode('utf-8', 'replace')
    return data.decode(encoding)
Exemplo n.º 2
0
def get_authors_title_test():
    """Check ``Retriever.get_authors_and_title`` against a live lib.ru page.

    Downloads up to the first 2048 bytes of the plain-text variant of the
    page, decodes them with the encoding detected by ``UniversalDetector``
    and asserts on the extracted author list and title.

    Requires network access and Python 2 (``urllib.urlopen``, ``unicode``).
    """
    import contextlib
    import urllib

    base_url = 'http://lib.ru/TXT/ruscience.txt'
    # Python 2 urlopen() responses are not context managers, so closing()
    # is used to make sure the connection is released even on failure.
    with contextlib.closing(urllib.urlopen(base_url + '_Ascii.txt')) as page:
        text = page.read(2048)

    ud = UniversalDetector()
    ud.feed(text)
    ud.close()
    encoding = ud.result['encoding']
    # Fail with a readable message instead of a TypeError from unicode().
    assert encoding is not None, 'could not detect the page encoding'
    text = unicode(text, encoding)

    authors, title = Retriever.get_authors_and_title(text)
    assert len(authors) == 1
    assert authors[0] == u'Дмитрий Толмацкий'
    assert title == u'Российская наука на пути из реанимации в морг'
Exemplo n.º 3
0
 def detectFileEncode(self, filePath):
     """Detect the character encoding of the file at ``filePath``.

     The file is fed to a ``UniversalDetector`` line by line, stopping as
     soon as the detector reports that it is done. The full detection
     result is printed to stdout.

     Args:
         filePath: Path of the file to inspect.

     Returns:
         The ``'encoding'`` entry of the detector result; chardet documents
         this as ``None`` when no encoding could be determined.
     """
     detector = UniversalDetector()
     # Binary mode: the detector must see the raw bytes, not text that has
     # already been decoded or newline-translated by the file object.
     with open(filePath, 'rb') as fp:
         # Iterate lazily so an early break avoids reading the whole file.
         for line in fp:
             detector.feed(line)
             if detector.done:
                 break
     detector.close()
     # Parenthesised single-argument form prints the same on Python 2 and 3.
     print(detector.result)
     return detector.result['encoding']