Пример #1
0
 def note_template(self, x):
     """Render the note text for the metadata mapping ``x``.

     Every key of ``x`` other than title, author, year, source, cached,
     tags and notes is turned into a ``:key: value`` line.  Those lines are
     joined with newlines and handed to ``TEMPLATE.format`` as ``attrs``,
     together with all entries of ``x`` as keyword arguments.

     Returns the formatted text, passed through ``force_unicode`` and
     encoded as UTF-8.
     """
     standard = 'title author year source cached tags notes'.split()
     extra_keys = set(x) - set(standard)
     lines = [(u':%s: %s' % (key, x[key])).strip() for key in extra_keys]
     attrs = u'\n'.join(lines).strip()
     if attrs:
         # A non-empty attribute section is terminated by a newline.
         attrs += '\n'
     rendered = TEMPLATE.format(attrs=attrs, **x)
     return force_unicode(rendered).encode('utf8')
Пример #2
0
 def store(self, name, content, overwrite=False):
     """Write ``content`` to the file ``name`` located under ``self.d``.

     The content is passed through ``force_unicode``, encoded as UTF-8 and
     written to the file followed by a single newline.

     Raises ``AssertionError`` if the target already exists and
     ``overwrite`` is false.

     Returns the encoded content (without the trailing newline).
     """
     target = self.d / name
     # Raised explicitly (rather than via ``assert``) so the guard is still
     # enforced when Python runs with -O, which strips assert statements.
     if not overwrite and target.exists():
         raise AssertionError(name + ' already exists!')
     # Convert before opening: mode 'wb' truncates the target, so a failing
     # conversion must happen before any existing data is destroyed.
     content = force_unicode(content).encode('utf8')
     with open(target, 'wb') as f:
         f.write(content)
         f.write('\n')    # new line at end of file
     return content
Пример #3
0
def robust_read_string(x, verbose=0):
    detector = UniversalDetector()
    #for line in StringIO(x):
    detector.feed(x)
    #if detector.done:
    #    break
    detector.close()
    if verbose:
        print 'encoding:', detector.result
    encoding = detector.result['encoding'] or 'utf8'
    return force_unicode(x.decode(encoding, 'replace').encode('utf8'))
Пример #4
0
def robust_read(filename, verbose=0):
    detector = UniversalDetector()
    for line in file(filename):
        detector.feed(line)
        if detector.done:
            break
    detector.close()
    if verbose:
        print 'encoding:', detector.result
    encoding = detector.result['encoding'] or 'utf8'
    with file(filename) as f:
        return force_unicode(f.read().decode(encoding, 'replace').encode('utf8'))
Пример #5
0
    def extract_plaintext(self):
        """Extract plain text from ``self.cached`` and store it.

        Files whose name ends in '.pdf' are converted with ``pdftotext``
        (output written to data/pdftotext.txt under ``self.d``); anything
        else is read with ``robust_read`` and run through ``htmltotext``.
        Ligatures are then removed and the text is saved as 'data/text',
        overwriting any previous copy.

        Returns whatever ``self.store`` returns.
        """
        if not self.cached.endswith('.pdf'):
            # Non-PDF input: read it, coerce to unicode, then clean up html.
            raw = force_unicode(robust_read(self.cached))
            plain = htmltotext(raw)
        else:
            # PDF input: delegate the extraction to pdftotext.
            plain = pdftotext(
                self.cached,
                output=self.d / 'data' / 'pdftotext.txt',
                verbose=True,
                usecached=True,
            )

        cleaned = remove_ligatures(plain)
        return self.store('data/text', cleaned, overwrite=True)
Пример #6
0
def uni(x):
    """Apply ``force_unicode`` to a string, or element-wise to a list.

    Lists are processed recursively, so nested lists are handled too.
    Anything that is neither a list nor a ``basestring`` trips the
    assertion, with the offending value as its message.
    """
    if not isinstance(x, list):
        assert isinstance(x, basestring), x
        return force_unicode(x)
    return [uni(item) for item in x]