示例#1
0
def az_text_extractor(mimetype, doc_source):
    """Return the extracted text of an AZ document.

    For ``text/html`` input the source is parsed with lxml and the text
    content of the first ``<div>`` whose class attribute is ``Section2``
    is returned.  Any other mimetype is passed through
    ``pdfdata_to_text`` and then ``text_after_line_numbers`` (both defined
    outside this block; presumably PDF-to-text conversion and line-number
    stripping -- confirm against their definitions).

    Raises ``IndexError`` if an HTML document has no matching div.
    """
    if mimetype != 'text/html':
        return text_after_line_numbers(pdfdata_to_text(doc_source))

    tree = lxml.html.fromstring(doc_source)
    sections = tree.xpath('//div[@class="Section2"]')
    # Only the first matching div is used; later matches are ignored.
    return sections[0].text_content()
示例#2
0
def fl_text_extractor(doc_source):
    """Return the extracted text of an FL document from its HTML source.

    When the page contains a ``<pre>`` element, the text of the first one
    is encoded to ASCII (unencodable characters replaced) and passed
    through ``text_after_line_numbers`` (defined outside this block).
    Otherwise the text of every ``<td>`` that is the second cell of its
    ``<tr>`` is joined with newlines and returned as-is.
    """
    tree = lxml.html.fromstring(doc_source)
    pre_nodes = tree.xpath('//pre')
    if not pre_nodes:
        # No <pre> block: fall back to the second column of the table rows.
        cells = tree.xpath('//tr/td[2]')
        return '\n'.join(cell.text_content() for cell in cells)

    ascii_text = pre_nodes[0].text_content().encode('ascii', 'replace')
    return text_after_line_numbers(ascii_text)
示例#3
0
def ar_text_extractor(doc_source):
    """Return the extracted text of an AR document.

    ``doc_source`` is converted with ``pdfdata_to_text`` and the result is
    passed through ``text_after_line_numbers`` (both defined outside this
    block; presumably PDF-to-text conversion and line-number stripping --
    confirm against their definitions).
    """
    raw_text = pdfdata_to_text(doc_source)
    return text_after_line_numbers(raw_text)
示例#4
0
def in_text_extractor(doc_source):
    """Return the extracted text of an IN document.

    Feeds ``doc_source`` to ``pdfdata_to_text`` and returns the result of
    ``text_after_line_numbers`` on that output (both helpers are defined
    outside this block; presumably PDF-to-text conversion and line-number
    stripping -- confirm against their definitions).
    """
    return text_after_line_numbers(pdfdata_to_text(doc_source))
示例#5
0
def ia_text_extractor(doc_source):
    """Return the extracted text of an IA document from its HTML source.

    Takes the text content of the first ``<pre>`` element and applies
    ``text_after_line_numbers`` (defined outside this block) twice.

    Raises ``IndexError`` if the document has no ``<pre>`` element.
    """
    tree = lxml.html.fromstring(doc_source)
    first_pre = tree.xpath('//pre')[0]
    # The text carries two sets of line numbers, so strip in two passes.
    first_pass = text_after_line_numbers(first_pre.text_content())
    return text_after_line_numbers(first_pass)
示例#6
0
def ak_text_extractor(doc_source):
    """Return the extracted text of an AK document from its HTML source.

    Takes the text content of the first ``<pre>`` element and passes it
    through ``text_after_line_numbers`` (defined outside this block).

    Raises ``IndexError`` if the document has no ``<pre>`` element.
    """
    tree = lxml.html.fromstring(doc_source)
    first_pre = tree.xpath('//pre')[0]
    return text_after_line_numbers(first_pre.text_content())
示例#7
0
def wi_text_extractor(mimetype, url, data):
    """Return the extracted text of a WI document, or None if it is not a PDF.

    A document counts as a PDF when ``mimetype`` is ``application/pdf`` or
    ``url`` ends with ``.pdf`` (case-sensitive).  PDF data is passed
    through ``pdfdata_to_text`` and then ``text_after_line_numbers`` (both
    defined outside this block).  Anything else yields ``None``.
    """
    # The URL is only inspected when the mimetype does not already say PDF.
    if mimetype != 'application/pdf' and not url.endswith('.pdf'):
        return None
    return text_after_line_numbers(pdfdata_to_text(data))
示例#8
0
def ut_text_extractor(mimetype, data):
    """Return the extracted text of a UT document, or None if it is not a PDF.

    Only ``application/pdf`` is handled: ``data`` is passed through
    ``pdfdata_to_text`` and then ``text_after_line_numbers`` (both defined
    outside this block).  Any other mimetype yields ``None``.
    """
    if mimetype != 'application/pdf':
        return None
    return text_after_line_numbers(pdfdata_to_text(data))
示例#9
0
def mo_text_extractor(doc_source):
    """Return the extracted text of an MO document, encoded to ASCII.

    ``doc_source`` is passed through ``pdfdata_to_text`` and then
    ``text_after_line_numbers`` (both defined outside this block); the
    result is encoded to ASCII with unencodable characters dropped.
    """
    stripped = text_after_line_numbers(pdfdata_to_text(doc_source))
    return stripped.encode('ascii', 'ignore')
示例#10
0
def hi_text_extractor(mimetype, doc_source):
    """Return the extracted text of an HI document, or None if it is not a PDF.

    Only ``application/pdf`` is handled: ``doc_source`` is passed through
    ``pdfdata_to_text`` and then ``text_after_line_numbers`` (both defined
    outside this block).  Any other mimetype yields ``None``.
    """
    if mimetype != 'application/pdf':
        return None
    return text_after_line_numbers(pdfdata_to_text(doc_source))