def note_template(self, x):
    """Render the note text for the metadata mapping ``x``.

    Every key of ``x`` other than ``title``, ``author``, ``year``,
    ``source``, ``cached``, ``tags`` and ``notes`` is rendered as a
    ``:key: value`` line; those lines are handed to ``TEMPLATE`` as the
    ``attrs`` field, and all items of ``x`` are passed as the remaining
    format fields.

    Returns the rendered note encoded as UTF-8.
    """
    standard = set('title author year source cached tags notes'.split())
    # Sort the extra keys: iterating the bare set gave an arbitrary order,
    # so the same metadata could render differently from run to run.
    extras = sorted(set(x) - standard)
    attrs = u'\n'.join((u':%s: %s' % (k, x[k])).strip() for k in extras).strip()
    if attrs:
        # Terminate the attribute block only when there is one.
        attrs += '\n'
    newdata = TEMPLATE.format(attrs=attrs, **x)
    return force_unicode(newdata).encode('utf8')
def store(self, name, content, overwrite=False):
    """Write ``content`` to ``self.d / name`` as UTF-8 plus a final newline.

    ``content`` is passed through ``force_unicode`` and encoded as UTF-8
    before the target file is opened, so a conversion failure cannot leave
    a truncated file behind.

    Returns the UTF-8 encoded content (without the trailing newline that
    is written to the file).

    Raises AssertionError if the target already exists and ``overwrite``
    is false.
    """
    t = self.d / name
    if not overwrite and t.exists():
        # Raised explicitly rather than via ``assert`` so the guard against
        # clobbering existing data is still enforced under ``python -O``.
        raise AssertionError(name + ' already exists!')
    encoded = force_unicode(content).encode('utf8')
    with open(t, 'wb') as f:
        f.write(encoded)
        f.write(b'\n')  # new line at end of file
    return encoded
def robust_read_string(x, verbose=0):
    """Decode the byte string ``x`` using an automatically detected encoding.

    The whole string is fed to a ``UniversalDetector``; when no encoding is
    reported, utf8 is assumed.  Undecodable bytes are replaced rather than
    raising.  If ``verbose`` is true the detector result is printed.

    Returns the result of ``force_unicode`` applied to the text re-encoded
    as UTF-8.
    """
    detector = UniversalDetector()
    detector.feed(x)
    detector.close()
    if verbose:
        # Single-argument form: prints the same text as the Python 2
        # statement ``print 'encoding:', detector.result``.
        print('encoding: %s' % (detector.result,))
    encoding = detector.result['encoding'] or 'utf8'
    try:
        text = x.decode(encoding, 'replace')
    except LookupError:
        # The detector may report an encoding name for which Python has no
        # codec; fall back to the default instead of crashing.
        text = x.decode('utf8', 'replace')
    return force_unicode(text.encode('utf8'))
def robust_read(filename, verbose=0):
    """Read ``filename`` and decode it using an automatically detected encoding.

    The file is fed line by line to a ``UniversalDetector`` until the
    detector reports it is done (or the file ends); when no encoding is
    reported, utf8 is assumed.  The file is then read in full and decoded,
    with undecodable bytes replaced rather than raising.  If ``verbose`` is
    true the detector result is printed.

    Returns the result of ``force_unicode`` applied to the text re-encoded
    as UTF-8.
    """
    detector = UniversalDetector()
    # Use a context manager so the handle used for detection is closed even
    # when the loop exits early.
    with open(filename) as f:
        for line in f:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    if verbose:
        # Single-argument form: prints the same text as the Python 2
        # statement ``print 'encoding:', detector.result``.
        print('encoding: %s' % (detector.result,))
    encoding = detector.result['encoding'] or 'utf8'
    with open(filename) as f:
        data = f.read()
    try:
        text = data.decode(encoding, 'replace')
    except LookupError:
        # The detector may report an encoding name for which Python has no
        # codec; fall back to the default instead of crashing.
        text = data.decode('utf8', 'replace')
    return force_unicode(text.encode('utf8'))
def extract_plaintext(self):
    """Produce the plain text of ``self.cached`` and save it as 'data/text'.

    A path ending in '.pdf' is converted with ``pdftotext`` (its output file
    is ``self.d / 'data' / 'pdftotext.txt'``); anything else is read with
    ``robust_read`` and passed through ``htmltotext``.  Ligatures are then
    removed and the text is handed to ``self.store`` with overwriting
    enabled; that call's return value is returned.
    """
    # NOTE(review): the branch layout below was reconstructed from a
    # whitespace-collapsed source -- confirm that the unicode/HTML cleanup
    # belongs to the non-PDF branch only.
    source = self.cached
    if not source.endswith('.pdf'):
        decoded = force_unicode(robust_read(source))
        plain = htmltotext(decoded)  # clean up html
    else:
        # extract text from pdfs
        pdf_dump = self.d / 'data' / 'pdftotext.txt'
        plain = pdftotext(source, output=pdf_dump, verbose=True, usecached=True)
    cleaned = remove_ligatures(plain)
    return self.store('data/text', cleaned, overwrite=True)
def uni(x):
    """Apply ``force_unicode`` to a string, or recursively to a list of them.

    A list argument yields a new list with ``uni`` applied to each element
    (nested lists are handled recursively).  Any other argument must be a
    ``basestring``; otherwise the assertion fails with the value as message.
    """
    if not isinstance(x, list):
        assert isinstance(x, basestring), x
        return force_unicode(x)
    return [uni(item) for item in x]