def az_text_extractor(mimetype, doc_source):
    """Extract text from a document according to its mimetype.

    For ``'text/html'`` the source is parsed with ``lxml.html`` and the text
    content of the first ``<div class="Section2">`` element is returned
    (an ``IndexError`` propagates if no such element exists).  Any other
    mimetype is converted with ``pdfdata_to_text`` and the result is passed
    through ``text_after_line_numbers``.
    """
    if mimetype != 'text/html':
        # Everything that is not HTML is handled as PDF data.
        return text_after_line_numbers(pdfdata_to_text(doc_source))

    tree = lxml.html.fromstring(doc_source)
    section = tree.xpath('//div[@class="Section2"]')[0]
    return section.text_content()
def fl_text_extractor(doc_source):
    """Extract text from an HTML document.

    If the document contains a ``<pre>`` element, the text of the first one
    is encoded to ASCII (unencodable characters replaced) and passed through
    ``text_after_line_numbers``.  Otherwise the text of every second ``<td>``
    of each ``<tr>`` is joined with newlines.
    """
    tree = lxml.html.fromstring(doc_source)
    pre_nodes = tree.xpath('//pre')

    if not pre_nodes:
        # No <pre> block: fall back to the second column of the table rows.
        cells = tree.xpath('//tr/td[2]')
        return '\n'.join(cell.text_content() for cell in cells)

    ascii_text = pre_nodes[0].text_content().encode('ascii', 'replace')
    return text_after_line_numbers(ascii_text)
def ar_text_extractor(doc_source):
    """Convert PDF data to text and pass it through ``text_after_line_numbers``."""
    raw_text = pdfdata_to_text(doc_source)
    return text_after_line_numbers(raw_text)
def in_text_extractor(doc_source):
    """Return ``text_after_line_numbers`` applied to the PDF's extracted text."""
    return text_after_line_numbers(pdfdata_to_text(doc_source))
def ia_text_extractor(doc_source):
    """Extract text from the first ``<pre>`` element of an HTML document.

    The text is passed through ``text_after_line_numbers`` twice because the
    source carries two sets of line numbers.  An ``IndexError`` propagates
    if the document has no ``<pre>`` element.
    """
    tree = lxml.html.fromstring(doc_source)
    content = tree.xpath('//pre')[0].text_content()

    # Strip two sets of line numbers: one pass per set.
    for _ in range(2):
        content = text_after_line_numbers(content)
    return content
def ak_text_extractor(doc_source):
    """Extract text from the first ``<pre>`` element of an HTML document.

    The element's text content is passed through ``text_after_line_numbers``.
    An ``IndexError`` propagates if the document has no ``<pre>`` element.
    """
    tree = lxml.html.fromstring(doc_source)
    pre_node = tree.xpath('//pre')[0]
    return text_after_line_numbers(pre_node.text_content())
def wi_text_extractor(mimetype, url, data):
    """Extract text from PDF data; return ``None`` for anything else.

    The document counts as a PDF when ``mimetype`` is ``'application/pdf'``
    or ``url`` ends with ``'.pdf'``.  The URL is only inspected when the
    mimetype does not already identify a PDF.
    """
    if mimetype != 'application/pdf' and not url.endswith('.pdf'):
        return None

    pdf_text = pdfdata_to_text(data)
    return text_after_line_numbers(pdf_text)
def ut_text_extractor(mimetype, data):
    """Extract text from PDF data; return ``None`` for any other mimetype."""
    if mimetype != 'application/pdf':
        return None

    pdf_text = pdfdata_to_text(data)
    return text_after_line_numbers(pdf_text)
def mo_text_extractor(doc_source):
    """Extract text from PDF data and encode it as ASCII.

    The PDF text is passed through ``text_after_line_numbers`` and the result
    is encoded to ASCII, silently dropping characters that cannot be encoded.
    """
    body = text_after_line_numbers(pdfdata_to_text(doc_source))
    return body.encode('ascii', 'ignore')
def hi_text_extractor(mimetype, doc_source):
    """Extract text from PDF data; return ``None`` for any other mimetype."""
    if mimetype != 'application/pdf':
        return None

    pdf_text = pdfdata_to_text(doc_source)
    return text_after_line_numbers(pdf_text)