def store(self, name, content, overwrite=False):
    """Write ``content`` to the file ``name`` below ``self.d`` as UTF-8.

    The content is passed through ``force_unicode``, encoded as UTF-8 and
    written to the file followed by a single newline.

    Args:
        name: Path of the target file, relative to ``self.d``.
        content: Text to store.
        overwrite: When false (the default), refuse to replace a file that
            already exists.

    Returns:
        The UTF-8 encoded content, without the trailing newline that is
        appended in the file.

    Raises:
        AssertionError: If the target exists and ``overwrite`` is false.
    """
    t = self.d / name
    if not overwrite and t.exists():
        # Raised explicitly instead of via ``assert`` so the guard still
        # protects existing files under ``python -O``; the exception type
        # and message are the same as before.
        raise AssertionError(name + ' already exists!')
    # Convert before opening: opening with 'wb' truncates the target, so a
    # conversion error must not be able to leave an emptied file behind.
    content = force_unicode(content).encode('utf8')
    with open(t, 'wb') as f:
        f.write(content)
        f.write(b'\n')  # new line at end of file
    return content
def note_template(self, x):
    """Render ``TEMPLATE`` for the metadata mapping ``x``.

    ``x`` is first converted with ``unicodify_dict``. Every key other than
    title, author, year, source, cached, tags and notes is rendered as a
    ``:key: value`` line; those lines are joined and passed to the template
    as ``attrs``, alongside all items of ``x`` as keyword arguments.

    Args:
        x: Mapping of metadata field names to values.

    Returns:
        The formatted template, passed through ``force_unicode`` and
        encoded as UTF-8.
    """
    x = unicodify_dict(x)
    known = set('title author year source cached tags notes'.split())
    # Sort the extra keys: iterating the set directly made the order of the
    # attribute lines depend on hash order instead of being stable.
    others = sorted(set(x) - known)
    attrs = u'\n'.join(
        (u':%s: %s' % (k, x[k])).strip() for k in others).strip()
    if attrs:
        attrs += u'\n'
    newdata = TEMPLATE.format(attrs=attrs, **x)
    return force_unicode(newdata).encode('utf8')
def robust_read_string(x, verbose=0):
    """Decode the byte string ``x`` using a detected character encoding.

    The whole string is fed to a ``UniversalDetector``; when no encoding is
    reported, UTF-8 is assumed. Undecodable bytes are replaced rather than
    raising.

    Args:
        x: Byte string to decode.
        verbose: When truthy, print the detector's result.

    Returns:
        The result of ``force_unicode`` applied to the UTF-8 re-encoding of
        the decoded text.
    """
    detector = UniversalDetector()
    detector.feed(x)
    detector.close()
    if verbose:
        # One pre-formatted argument prints the same text whether ``print``
        # is a statement or a function.
        print('encoding: %s' % (detector.result,))
    encoding = detector.result['encoding'] or 'utf8'
    try:
        text = x.decode(encoding, 'replace')
    except LookupError:
        # Fall back in case the detector reports an encoding name for which
        # this Python has no codec.
        text = x.decode('utf8', 'replace')
    return force_unicode(text.encode('utf8'))
def robust_read(filename, verbose=0):
    """Read ``filename`` and decode it using a detected character encoding.

    The file is fed line by line to a ``UniversalDetector`` until the
    detector reports it is done; when no encoding is reported, UTF-8 is
    assumed. The file is then read in full and decoded, with undecodable
    bytes replaced rather than raising.

    Args:
        filename: Path of the file to read.
        verbose: When truthy, print the detector's result.

    Returns:
        The result of ``force_unicode`` applied to the UTF-8 re-encoding of
        the decoded file content.
    """
    detector = UniversalDetector()
    # Binary mode: the detector and ``decode`` both work on raw bytes. The
    # ``with`` block closes the handle even when detection stops early.
    with open(filename, 'rb') as f:
        for line in f:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    if verbose:
        # One pre-formatted argument prints the same text whether ``print``
        # is a statement or a function.
        print('encoding: %s' % (detector.result,))
    encoding = detector.result['encoding'] or 'utf8'
    with open(filename, 'rb') as f:
        data = f.read()
    try:
        text = data.decode(encoding, 'replace')
    except LookupError:
        # Fall back in case the detector reports an encoding name for which
        # this Python has no codec.
        text = data.decode('utf8', 'replace')
    return force_unicode(text.encode('utf8'))
def extract_plaintext(self):
    """Extract plain text from ``self.cached`` and store it as 'data/text'.

    Paths ending in '.pdf' are converted with ``pdftotext`` (writing to
    ``self.d / 'data' / 'pdftotext.txt'`` with ``usecached=True``); any
    other file is read with ``robust_read`` and passed through
    ``htmltotext``. The text then goes through ``remove_ligatures``.

    Returns:
        Whatever ``self.store('data/text', text, overwrite=True)`` returns.
    """
    # NOTE(review): the original indentation of this method was lost; the
    # branch layout below (html clean-up only for non-PDF input, ligature
    # removal for both) is a reconstruction -- confirm against the author's
    # intent.
    if self.cached.endswith('.pdf'):
        # extract text from pdfs
        text = pdftotext(self.cached, output=self.d / 'data' / 'pdftotext.txt', verbose=True, usecached=True)
    else:
        text = robust_read(self.cached)
        text = force_unicode(text)
        text = htmltotext(text)  # clean up html
    text = remove_ligatures(text)
    return self.store('data/text', text, overwrite=True)
def __init__(self, raw):
    """Parse ``raw`` as exactly one BibTeX entry and register it by field.

    Sets ``_raw`` (the input as given), ``raw`` (stripped and passed
    through ``force_unicode``), ``styles`` (empty dict), ``key``, ``entry``
    and ``fields``. Each role in ``entry.persons`` is copied into
    ``fields``, and the instance is appended to ``fields[k]`` of the
    ``fields`` mapping defined outside this method for every field name.

    Args:
        raw: BibTeX source text containing a single entry.

    Raises:
        AssertionError: If the parsed text does not contain exactly one
            entry, or if the entry has more than two person roles.
    """
    self._raw = raw
    self.raw = force_unicode(raw.strip())
    self.styles = {}
    bibliography = bibtex.Parser().parse_stream(StringIO(self.raw))
    entries = bibliography.entries
    assert len(entries) == 1, 'Entry is supposed to represent only one BibTex entry.'
    # Materialise before indexing: ``items()`` is only subscriptable when
    # it returns a list, not when it returns a view.
    self.key, self.entry = list(entries.items())[0]
    self.fields = self.entry.fields
    for role, people in self.entry.persons.items():
        self.fields[role] = people
    assert len(self.entry.persons) <= 2, 'ERROR: too people.'
    for k in self.fields:
        fields[k].append(self)
def __init__(self, raw):
    """Build the object from BibTeX text that holds a single entry.

    Stores the input as ``_raw`` and its stripped, ``force_unicode``-d form
    as ``raw``, parses it, and exposes the entry's ``key``, ``entry`` and
    ``fields``. Person roles are merged into ``fields``; the instance is
    then appended to the outer ``fields`` mapping under every field name.

    Args:
        raw: BibTeX source text.

    Raises:
        AssertionError: If the text does not parse to exactly one entry, or
            the entry carries more than two person roles.
    """
    self._raw = raw
    self.raw = force_unicode(raw.strip())
    self.styles = {}

    stream = StringIO(self.raw)
    parsed = bibtex.Parser().parse_stream(stream)
    entries = parsed.entries
    assert len(entries) == 1, 'Entry is supposed to represent only one BibTex entry.'

    only_item = list(entries.items())[0]
    self.key, self.entry = only_item
    self.fields = self.entry.fields

    # Copy each person role (snapshotted first) into the field mapping.
    role_items = list(self.entry.persons.items())
    for role, people in role_items:
        self.fields[role] = people
    assert len(self.entry.persons) <= 2, 'ERROR: too people.'

    # Register this instance under each of its field names.
    for field_name in self.fields:
        fields[field_name].append(self)
def uni(x):
    """Apply ``force_unicode`` to a string or, recursively, to a list.

    Args:
        x: A string, or a list whose items are strings or further lists.

    Returns:
        ``force_unicode(x)`` for a string; for a list, a new list with
        ``uni`` applied to every item.

    Raises:
        AssertionError: If ``x`` is neither a list nor a string.
    """
    if not isinstance(x, list):
        assert isinstance(x, basestring), x
        return force_unicode(x)
    return [uni(item) for item in x]