def html(from_file, to_txt, opts=None):
    """Convert an HTML file to plain text with html2text.

    The file at ``from_file.path`` is read using the encoding reported by
    ``find_encoding`` and rendered to text by a configured
    ``html2text.HTML2Text`` instance. The result is written to
    ``to_txt.path`` with that same encoding.

    ``opts`` is accepted for signature parity with the other converters and
    is not read here.

    Returns the value of ``save_raw_data``, or ``None`` when ``raw_data``
    yields nothing for the input file.
    """
    log.debug('html2txt starting')

    converter = html2text.HTML2Text()
    # Applied in this exact order; links, images and emphasis are dropped
    # and the output is wrapped at 78 columns.
    settings = (
        ('split_next_td', False),
        ('td_count', 0),
        ('table_start', False),
        ('unicode_snob', 0),
        ('escape_snob', 0),
        ('links_each_paragraph', 0),
        ('body_width', 78),
        ('skip_internal_links', True),
        ('inline_links', True),
        ('protect_links', True),
        ('ignore_links', True),
        ('ignore_images', True),
        ('images_to_alt', True),
        ('ignore_emphasis', True),
        ('bypass_tables', 1),
        ('google_doc', False),
        ('ul_item_mark', '*'),
        ('emphasis_mark', '_'),
        ('strong_mark', '**'),
        ('single_line_break', True),
    )
    for attribute, value in settings:
        setattr(converter, attribute, value)

    source_encoding = find_encoding(from_file.path)
    markup = raw_data(from_file.path, source_encoding)
    if not markup:
        return None

    plain = converter.handle(markup)
    return save_raw_data(to_txt.path, plain, source_encoding)
def txt(from_file, to_txt, opts):
    """Copy a text file to the output path; the minimal example converter.

    The input is read with the encoding reported by ``find_encoding`` and
    written back unchanged with that same encoding. ``opts`` is not read.

    Returns the value of ``save_raw_data``.
    """
    log.debug('txt2txt starting')
    source_path = from_file.path
    detected = find_encoding(source_path)
    content = raw_data(source_path, detected)
    return save_raw_data(to_txt.path, content, detected)
def odt(from_file, to_txt, opts):
    """Extract paragraph and heading text from an OpenDocument text file.

    ``content.xml`` is read from the zip container at ``from_file.path``.
    The text of every ``text:p`` and ``text:h`` element is collected in
    document order and joined with newlines, with no trailing newline. The
    result is written to ``to_txt.path``. ``opts`` is not read.

    Returns the value of ``save_raw_data``.
    """
    # The container is a zip archive, so it must be opened in binary mode;
    # text mode can corrupt the bytes on platforms that translate newlines.
    with open(from_file.path, 'rb') as stream:
        archive = zipfile.ZipFile(stream)
        content = ET.fromstring(archive.read("content.xml"))

    wanted_tags = (qn('text:p'), qn('text:h'))
    lines = [
        text_to_string(node)
        for node in content.iter()
        if node.tag in wanted_tags
    ]
    # Same output as appending "\n" after each item and trimming the last
    # one, without the repeated string concatenation.
    return save_raw_data(to_txt.path, u"\n".join(lines))
def xlsx(from_file, to_txt, opts):
    """Dump the cell values of every sheet in a workbook as plain text.

    Each row that has at least one non-empty cell becomes one output line,
    with its cell values separated by single spaces. Sheets are processed
    in the order given by ``workbook.sheet_names()``. ``opts`` is not read.

    Returns the value of ``save_raw_data``.
    """
    workbook = xlrd.open_workbook(from_file.path)
    line_break = unicode(os.linesep)

    # The output has always started with a line separator; keep that.
    chunks = [os.linesep]
    for sheet_name in workbook.sheet_names():
        worksheet = workbook.sheet_by_name(sheet_name)
        column_count = worksheet.ncols
        for row_index in xrange(worksheet.nrows):
            cells = []
            for column_index in xrange(column_count):
                value = worksheet.cell_value(row_index, column_index)
                # Skip only empty cells. A plain truthiness test would also
                # drop numeric cells holding 0 / 0.0, which are real data.
                if value is None or value == u'':
                    continue
                if isinstance(value, (int, float)):
                    value = unicode(value)
                cells.append(value)
            if cells:
                chunks.append(u' '.join(cells) + line_break)

    return save_raw_data(to_txt.path, u''.join(chunks))
try: if path and to_txt.size() <= 1000: if not opts['--ocr'] and not opts['--without-ocr']: log.info('OCR running. The last output text file is suspicious and almost empty') path = pdf_ocr(from_file, to_txt, opts) except: log.critical('IGNORED') def pdf_ocr(from_file, to_txt, opts): pdftopng(from_file.path, to_txt.path) text = [] outputpath = os.path.join(to_txt.dirname, 'output.txt') regex = re.compile('.*png$') raw = None for _, _, files in walk(to_txt.dirname, regex=regex): for f in files: if (f.name).startswith(to_txt.basename): log.info('tesseract is processing:') log.info(f.path) tesseract(f.path, None, opts) try: raw = raw_data(outputpath) except Exception, e: log.critical('pdf_ocr: %s' % e) text.append(raw) remove(f.path) remove(outputpath) if text: return save_raw_data(to_txt.path, text)
def rtf(from_file, to_txt, opts):
    """Convert an RTF file to plain text.

    The document at ``from_file.path`` is parsed with ``Rtf15Reader`` and
    rendered with ``PlaintextWriter``. The resulting text is written to
    ``to_txt.path``. ``opts`` is not read.

    Returns the value of ``save_raw_data``.
    """
    # Use a context manager so the input handle is closed even when parsing
    # fails. Rendering stays inside the block in case the reader is lazy.
    with open(from_file.path, "rb") as stream:
        doc = Rtf15Reader.read(stream)
        text = PlaintextWriter.write(doc).getvalue()
    return save_raw_data(to_txt.path, text)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author: Jonathan S. Prieto
# @Date: 2015-03-16 01:53:06
# @Last Modified by: Jonathan Prieto
# @Last Modified time: 2015-07-15 10:14:21
from _utils import save_raw_data
from atxt.log_conf import Logger
import docx

log = Logger.log

__all__ = ['qdocx']


def qdocx(from_file, to_txt, opts):
    """Convert a .docx file to plain text.

    The document at ``from_file.path`` is opened with ``docx.opendocx``.
    The items returned by ``docx.getdocumenttext`` are written to
    ``to_txt.path`` as UTF-8. ``opts`` is not read.

    Returns the value of ``save_raw_data``, or ``None`` when the document
    cannot be opened (the error is logged at critical level).
    """
    log.debug('docx2txt starting')
    try:
        doc = docx.opendocx(from_file.path)
    except Exception as e:
        log.critical(e)
        # Stop here: without a document there is nothing to extract, and
        # continuing would fail on the undefined `doc`.
        return None
    text = list(docx.getdocumenttext(doc))
    return save_raw_data(to_txt.path, text, encoding='utf-8')