def text_from_file(self, filepath):
    """Return the text content of ``filepath``.

    The source is chosen in this order:

    1. If ``self.options.pdf_input`` is truthy, the path is handed to
       ``pdf2text`` and its result is returned.
    2. Otherwise, if the path contains ``'s3://'``, the result of
       ``self.s3open(filepath)`` is returned.
    3. Otherwise the path is opened as a UTF-8 encoded file and its
       whole content is returned.

    :param filepath: path (or S3 URL) of the document to read.
    :returns: whatever the selected reader returns; for the plain-file
        branch this is the decoded file content.
    """
    if self.options.pdf_input:
        return pdf2text(filepath)

    # Substring test (not a prefix test) is kept as-is so that existing
    # callers see the same routing.
    if 's3://' in filepath:
        return self.s3open(filepath)

    # Read inside a context manager so the handle is closed right away
    # instead of lingering until garbage collection.
    with codecs.open(filepath, 'r', 'utf-8') as handle:
        return handle.read()
def mapper(self, _, filepath):
    """Emit the paths whose extracted text is empty.

    The incoming ``filepath`` is cleaned of surrounding newlines and
    whitespace, passed to ``pdf2text``, and yielded as ``(None, path)``
    only when the extracted text is empty or whitespace-only. Nothing
    is yielded for inputs that produce text.

    :param _: input key; unused.
    :param filepath: one input line holding the path to a document.
    """
    # NOTE(review): this calls pdf2text directly rather than
    # self.text_from_file, so self.options.pdf_input and the S3 branch
    # are not consulted here -- confirm that this is intentional.
    cleaned_path = filepath.strip('\n').strip()
    extracted = pdf2text(cleaned_path)

    if extracted.strip():
        # The document produced text; there is nothing to report.
        return

    yield None, cleaned_path