Exemplo n.º 1
0
def html(from_file, to_txt, opts=None):
    """Convert an HTML file to plain text with html2text.

    The source at ``from_file.path`` is read using the encoding reported by
    ``find_encoding``, rendered by a configured ``html2text.HTML2Text``
    instance, and passed to ``save_raw_data`` for ``to_txt.path`` with the
    same encoding.

    Returns whatever ``save_raw_data`` returns, or ``None`` when the source
    yields no content. ``opts`` is accepted but not used by this converter.
    """
    log.debug('html2txt starting')

    converter = html2text.HTML2Text()
    # Renderer configuration, applied in the order listed.
    settings = (
        ('split_next_td', False),
        ('td_count', 0),
        ('table_start', False),
        ('unicode_snob', 0),
        ('escape_snob', 0),
        ('links_each_paragraph', 0),
        ('body_width', 78),
        ('skip_internal_links', True),
        ('inline_links', True),
        ('protect_links', True),
        ('ignore_links', True),
        ('ignore_images', True),
        ('images_to_alt', True),
        ('ignore_emphasis', True),
        ('bypass_tables', 1),
        ('google_doc', False),
        ('ul_item_mark', '*'),
        ('emphasis_mark', '_'),
        ('strong_mark', '**'),
        ('single_line_break', True),
    )
    for attribute, value in settings:
        setattr(converter, attribute, value)

    encoding = find_encoding(from_file.path)
    markup = raw_data(from_file.path, encoding)
    if markup:
        rendered = converter.handle(markup)
        return save_raw_data(to_txt.path, rendered, encoding)
    # Nothing could be read from the source: there is nothing to save.
    return None
Exemplo n.º 2
0
def txt(from_file, to_txt, opts):
    """Copy a text file to the output path, keeping its detected encoding.

    The encoding of ``from_file.path`` is obtained from ``find_encoding``
    and used both to read the source (``raw_data``) and to write the result
    (``save_raw_data``). ``opts`` is accepted but not used.

    Returns whatever ``save_raw_data`` returns.
    """
    log.debug('txt2txt starting')
    encoding = find_encoding(from_file.path)
    contents = raw_data(from_file.path, encoding)
    result = save_raw_data(to_txt.path, contents, encoding)
    return result
Exemplo n.º 3
0
def odt(from_file, to_txt, opts):
    """Extract the text of an OpenDocument (.odt) file into a text file.

    The ``content.xml`` member of the archive at ``from_file.path`` is
    parsed, and the text of every ``text:p`` and ``text:h`` element is
    collected in document order, one element per line (no trailing newline).

    Args:
        from_file: object exposing the source location as ``.path``.
        to_txt: object exposing the destination location as ``.path``.
        opts: accepted for interface compatibility; not used here.

    Returns:
        Whatever ``save_raw_data`` returns for the joined text.
    """
    # A zip archive is binary data: opening it in text mode corrupts it on
    # platforms that translate newlines and is rejected outright by
    # interpreters that decode text-mode streams.
    with open(from_file.path, 'rb') as stream:
        zip_stream = zipfile.ZipFile(stream)
        content = ET.fromstring(zip_stream.read("content.xml"))

    # Resolve the qualified tag names once instead of on every element.
    wanted_tags = (qn('text:p'), qn('text:h'))
    paragraphs = [text_to_string(child)
                  for child in content.iter()
                  if child.tag in wanted_tags]
    # Joining with newlines is equivalent to appending "\n" after every
    # paragraph and then dropping the final one.
    return save_raw_data(to_txt.path, u"\n".join(paragraphs))
Exemplo n.º 4
0
def xlsx(from_file, to_txt, opts):
    """Dump the cell values of every sheet of a workbook into a text file.

    Each worksheet row becomes one output line made of its non-empty cell
    values separated by single spaces; rows without any value are skipped.
    The output always starts with one ``os.linesep``.

    Args:
        from_file: object exposing the workbook location as ``.path``.
        to_txt: object exposing the destination location as ``.path``.
        opts: accepted for interface compatibility; not used here.

    Returns:
        Whatever ``save_raw_data`` returns for the assembled text.
    """
    workbook = xlrd.open_workbook(from_file.path)
    line_end = unicode(os.linesep)
    lines = []
    for sheet_name in workbook.sheet_names():
        worksheet = workbook.sheet_by_name(sheet_name)
        num_rows = worksheet.nrows
        num_cells = worksheet.ncols

        for curr_row in xrange(num_rows):
            cells = []
            for index_col in xrange(num_cells):
                value = worksheet.cell_value(curr_row, index_col)
                # NOTE(review): this truthiness test also drops cells whose
                # value is numeric 0 -- kept as-is to preserve the existing
                # output; confirm whether zeros should be emitted.
                if value:
                    if isinstance(value, (int, float)):
                        value = unicode(value)
                    cells.append(value)
            if cells:
                lines.append(u' '.join(cells) + line_end)

    # Assemble once at the end instead of growing a string row by row.
    output = os.linesep
    if lines:
        output += u''.join(lines)
    return save_raw_data(to_txt.path, output)
Exemplo n.º 5
0
    try:
        if path and to_txt.size() <= 1000:
            if not opts['--ocr'] and not opts['--without-ocr']:
                log.info('OCR running. The last output text file is suspicious and almost empty') 
                path = pdf_ocr(from_file, to_txt, opts)
    except: 
        log.critical('IGNORED')


def pdf_ocr(from_file, to_txt, opts):
    """Extract text from a PDF by running OCR on its rendered pages.

    ``pdftopng`` is invoked on the source, then every ``*.png`` file found
    under ``to_txt.dirname`` whose name starts with ``to_txt.basename`` is
    passed to ``tesseract``. After each run the contents of ``output.txt``
    in that directory are read and collected, and the image is removed.
    (Presumably ``tesseract`` writes its result to that ``output.txt`` --
    confirm against the helper.)

    Args:
        from_file: object exposing the PDF location as ``.path``.
        to_txt: object exposing ``.path``, ``.dirname`` and ``.basename``
            of the destination text file.
        opts: options forwarded unchanged to ``tesseract``.

    Returns:
        Whatever ``save_raw_data`` returns when at least one page produced
        text, otherwise ``None``.
    """
    pdftopng(from_file.path, to_txt.path)
    text = []
    outputpath = os.path.join(to_txt.dirname, 'output.txt')
    regex = re.compile('.*png$')
    for _, _, files in walk(to_txt.dirname, regex=regex):
        for f in files:
            if not f.name.startswith(to_txt.basename):
                continue
            log.info('tesseract is processing:')
            log.info(f.path)
            tesseract(f.path, None, opts)
            try:
                raw = raw_data(outputpath)
            except Exception as e:
                # Best effort: a page that cannot be read back is logged and
                # skipped, instead of appending the previous page's text (or
                # None) a second time.
                log.critical('pdf_ocr: %s' % e)
            else:
                text.append(raw)
            remove(f.path)
    remove(outputpath)
    if text:
        return save_raw_data(to_txt.path, text)
    return None
Exemplo n.º 6
0
def rtf(from_file, to_txt, opts):
    """Convert an RTF document to plain text.

    The file at ``from_file.path`` is parsed with ``Rtf15Reader`` and
    rendered with ``PlaintextWriter``; the resulting text is handed to
    ``save_raw_data`` for ``to_txt.path``.

    Args:
        from_file: object exposing the RTF location as ``.path``.
        to_txt: object exposing the destination location as ``.path``.
        opts: accepted for interface compatibility; not used here.

    Returns:
        Whatever ``save_raw_data`` returns.
    """
    # Use a context manager so the handle is released even when parsing
    # fails; the text is rendered before the source is closed.
    with open(from_file.path, "rb") as source:
        doc = Rtf15Reader.read(source)
        text = PlaintextWriter.write(doc).getvalue()
    return save_raw_data(to_txt.path, text)
Exemplo n.º 7
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author: Jonathan S. Prieto
# @Date:   2015-03-16 01:53:06
# @Last Modified by:   Jonathan Prieto
# @Last Modified time: 2015-07-15 10:14:21
from _utils import save_raw_data
from atxt.log_conf import Logger
import docx

# Module-level logger, taken from the project's Logger configuration.
log = Logger.log

# Only ``qdocx`` is exported by a star-import of this module.
__all__ = ['qdocx']


def qdocx(from_file, to_txt, opts):
    """Convert a .docx document to a UTF-8 text file.

    The document at ``from_file.path`` is opened with ``docx.opendocx`` and
    the items produced by ``docx.getdocumenttext`` are passed, as a list,
    to ``save_raw_data`` for ``to_txt.path``.

    Args:
        from_file: object exposing the document location as ``.path``.
        to_txt: object exposing the destination location as ``.path``.
        opts: accepted for interface compatibility; not used here.

    Returns:
        Whatever ``save_raw_data`` returns, or ``None`` when the document
        cannot be opened (the error is logged).
    """
    log.debug('docx2txt starting')
    try:
        doc = docx.opendocx(from_file.path)
    except Exception as e:
        log.critical(e)
        # Without a document there is nothing to extract; stop here instead
        # of failing later with a NameError on the unbound ``doc``.
        return None
    text = list(docx.getdocumenttext(doc))
    return save_raw_data(to_txt.path, text, encoding='utf-8')