def process_file(filepath, field_filter_mode=None):
    """
    Pick the process_* handler that fits `filepath` and return its result.

    Detection order: OLE container (xls, otherwise doc), then RTF (first 4
    bytes equal RTF_START), then the type reported by ooxml.get_type. A file
    that is not detected as Excel -- including one whose type detection
    raised -- is handed to process_docx.

    :param filepath: path of the file to analyse
    :param field_filter_mode: forwarded to the rtf, xlsx and docx handlers;
                              the two OLE handlers are called without it
    :returns: the return value of the selected handler
    """
    # OLE container: xls or (by default) doc
    if olefile.isOleFile(filepath):
        log.debug('checking streams to see whether this is xls')
        if xls_parser.is_xls(filepath):
            return process_xls(filepath)
        return process_doc(filepath)

    # RTF is recognised by its magic bytes; the handler gets the open handle
    with open(filepath, 'rb') as rtf_candidate:
        magic = rtf_candidate.read(4)
        if magic == RTF_START:
            return process_rtf(rtf_candidate, field_filter_mode)

    # ooxml: a failing detection is logged and treated as "type unknown"
    detected = None
    try:
        detected = ooxml.get_type(filepath)
    except Exception:
        log.debug('Exception trying to xml-parse file', exc_info=True)

    if detected:
        log.debug('Detected file type: {0}'.format(detected))

    if detected == ooxml.DOCTYPE_EXCEL:
        return process_xlsx(filepath, field_filter_mode)
    return process_docx(filepath, field_filter_mode)
def process_file(filepath, field_filter_mode=None):
    """
    Select one of the process_* functions for `filepath` and call it.

    OLE files are processed as xls or doc. Files starting with RTF_START are
    processed as rtf. For everything else ooxml.get_type is asked: Excel is
    processed as xlsx, an undetermined type (detection raised) as csv, and
    any other type as docx.

    :param filepath: path of the file to analyse
    :param field_filter_mode: forwarded to process_rtf and process_docx only
    :returns: the return value of the selected handler
    """
    if olefile.isOleFile(filepath):
        log.debug('Is OLE. Checking streams to see whether this is xls')
        if xls_parser.is_xls(filepath):
            log.debug('Process file as excel 2003 (xls)')
            return process_xls(filepath)
        log.debug('Process file as word 2003 (doc)')
        return process_doc(filepath)

    with open(filepath, 'rb') as rtf_candidate:
        magic = rtf_candidate.read(4)
        if magic == RTF_START:
            log.debug('Process file as rtf')
            return process_rtf(rtf_candidate, field_filter_mode)

    try:
        kind = ooxml.get_type(filepath)
        log.debug('Detected file type: {0}'.format(kind))
    except Exception as exc:
        log.debug('Exception trying to xml-parse file: {0}'.format(exc))
        kind = None

    if kind == ooxml.DOCTYPE_EXCEL:
        log.debug('Process file as excel 2007+ (xlsx)')
        return process_xlsx(filepath)

    if kind is None:
        log.debug('Process file as csv')
        return process_csv(filepath)

    # could be docx; if not: this is the old default code path
    log.debug('Process file as word 2007+ (docx)')
    return process_docx(filepath, field_filter_mode)
def test_all_rough(self):
    """
    Run ooxml.get_type over the sample files and check the reported doctype.

    Walks DATA_BASE_DIR. Files listed by name, files with a listed extension
    and OLE files are skipped. For every remaining file get_type must not
    raise and must return the Excel, Word or PowerPoint doctype.
    """
    ok_doctypes = (ooxml.DOCTYPE_EXCEL, ooxml.DOCTYPE_WORD,
                   ooxml.DOCTYPE_POWERPOINT)

    # samples that are neither OLE nor xml
    skipped_names = ('empty', 'text')
    skipped_suffixes = ('.xml', '.rtf', '.csv')

    for folder, _, names in os.walk(DATA_BASE_DIR):
        for name in names:
            sample = join(folder, name)

            # first matching rule decides why (and whether) a file is skipped
            if name in skipped_names:
                skip_prefix = 'skip file: '
            elif splitext(name)[1] in skipped_suffixes:
                skip_prefix = 'skip extn: '
            elif isOleFile(sample):
                skip_prefix = 'skip ole: '
            else:
                skip_prefix = None

            if skip_prefix is not None:
                if self.DO_DEBUG:
                    print(skip_prefix + name)
                continue

            try:
                found = ooxml.get_type(sample)
            except Exception:
                self.fail('Failed to get doctype of {0}'.format(name))

            self.assertTrue(
                found in ok_doctypes,
                msg='Doctype "{0}" for {1} not acceptable'.format(
                    found, sample))
            if self.DO_DEBUG:
                print('ok: {0} --> {1}'.format(name, found))
def test_rough_doctype(self):
    """
    Check that the doctype detected for each sample fits its file extension.

    Walks DATA_BASE_DIR. Files listed by name, files with a listed extension
    and OLE files are skipped. For every remaining file the result of
    ooxml.get_type must be (one of) the doctype(s) registered for the file's
    extension. An extension that has no entry in the map raises KeyError.
    """
    # extension (without the dot) --> expected doctype or tuple of doctypes
    expected_by_ext = {
        'xml': (ooxml.DOCTYPE_EXCEL_XML, ooxml.DOCTYPE_WORD_XML),
    }
    expected_by_ext.update(dict.fromkeys(
        ('docx', 'docm', 'dotx', 'dotm'),
        ooxml.DOCTYPE_WORD))
    expected_by_ext.update(dict.fromkeys(
        ('xlsx', 'xlsm', 'xlsb', 'xlam', 'xltx', 'xltm'),
        ooxml.DOCTYPE_EXCEL))
    expected_by_ext.update(dict.fromkeys(
        ('pptx', 'pptm', 'ppsx', 'ppsm', 'potx', 'potm'),
        ooxml.DOCTYPE_POWERPOINT))

    # samples that are neither OLE nor xml
    ignored_names = ('empty', 'text')
    ignored_exts = ('rtf', 'csv')

    for folder, _, names in os.walk(DATA_BASE_DIR):
        for name in names:
            if name in ignored_names:
                if self.DO_DEBUG:
                    print('skip file: ' + name)
                continue

            # splitext keeps the leading dot; slicing an empty string is safe
            ext = splitext(name)[1][1:]
            if ext in ignored_exts:
                if self.DO_DEBUG:
                    print('skip extn: ' + name)
                continue

            path = join(folder, name)
            if isOleFile(path):
                if self.DO_DEBUG:
                    print('skip ole: ' + name)
                continue

            # normalise the map entry so that it can always be searched
            wanted = expected_by_ext[ext]
            if not isinstance(wanted, tuple):
                wanted = (wanted, )

            try:
                found = ooxml.get_type(path)
            except Exception:
                self.fail('Failed to get doctype of {0}'.format(name))

            self.assertTrue(
                found in wanted,
                msg='Doctype "{0}" for {1} not acceptable'.format(
                    found, path))
            if self.DO_DEBUG:
                print('ok: {0} --> {1}'.format(name, found))
def test_all_rough(self):
    """
    Run ooxml.get_type over the sample files and check the reported doctype.

    Walks DATA_BASE_DIR. Files listed by name, files with a listed extension
    and OLE files are skipped silently. For every remaining file get_type
    must not raise and must return the Excel, Word or PowerPoint doctype.
    """
    allowed = (ooxml.DOCTYPE_EXCEL, ooxml.DOCTYPE_WORD,
               ooxml.DOCTYPE_POWERPOINT)
    skip_names = ('empty', 'text')
    skip_suffixes = ('.xml', '.rtf')

    for folder, _, names in os.walk(DATA_BASE_DIR):
        for name in names:
            # excluded by file name or by extension
            if name in skip_names or splitext(name)[1] in skip_suffixes:
                continue

            # OLE files are not handled by ooxml
            path = join(folder, name)
            if isOleFile(path):
                continue

            try:
                found = ooxml.get_type(path)
            except Exception:
                self.fail('Failed to get doctype of {0}'.format(name))

            self.assertTrue(
                found in allowed,
                msg='Doctype "{0}" for {1} not acceptable'.format(
                    found, path))
def process_file(filepath, field_filter_mode=None):
    """
    Decide which of process_doc/x or process_xls/x to call and call it.

    OLE files go to process_xls or process_doc. For all other files the type
    is determined with ooxml.get_type: Excel goes to process_xlsx, every
    other type -- and every file whose type cannot be determined -- goes to
    process_docx.

    Only the type detection is guarded by try/except. An exception raised by
    process_xlsx or process_docx propagates to the caller instead of being
    swallowed and answered with a second, silent process_docx run.

    :param filepath: path of the file to analyse
    :param field_filter_mode: forwarded to process_xlsx / process_docx; the
                              two OLE handlers are called without it
    :returns: the return value of the selected handler
    """
    if olefile.isOleFile(filepath):
        log.debug('checking streams to see whether this is xls')
        if xls_parser.is_xls(filepath):
            return process_xls(filepath)
        return process_doc(filepath)

    # keep the try body minimal: a failing detection means "treat as docx",
    # a failing handler is a real error that must not be hidden
    try:
        doctype = ooxml.get_type(filepath)
    except Exception:
        log.debug('Exception trying to xml-parse file', exc_info=True)
        doctype = None
    else:
        log.debug('Detected file type: {0}'.format(doctype))

    if doctype == ooxml.DOCTYPE_EXCEL:
        return process_xlsx(filepath, field_filter_mode)
    return process_docx(filepath, field_filter_mode)
def test(*filenames):
    """
    Parse all given file names and log their rough structure.

    Logging is configured at DEBUG level. Every OLE file is opened as
    XlsFile and each of its streams is logged; for WorkbookStream instances
    every record is logged as well. Files that are not OLE are skipped.

    :param filenames: paths of the files to inspect
    :returns: 2 if no file name was given, 0 otherwise
    """
    logging.basicConfig(level=logging.DEBUG)

    if not filenames:
        logging.info('need file name[s]')
        return 2

    for path in filenames:
        logging.info('checking file {0}'.format(path))
        if olefile.isOleFile(path):
            workbook_file = XlsFile(path)
            for stream in workbook_file.get_streams():
                logging.info(stream)
                # records are only listed for workbook streams
                if not isinstance(stream, WorkbookStream):
                    continue
                for record in stream.iter_records():
                    logging.info(' {0}'.format(record))
        else:
            logging.info('not an ole file - skip')

    return 0
def test(*filenames):
    """
    Log the rough structure of each given file.

    Sets up logging at DEBUG level, then walks over the file names: non-OLE
    files are skipped with a message, OLE files are opened as XlsFile and
    their streams are logged. Records are logged only for streams that are
    WorkbookStream instances.

    :param filenames: paths of the files to inspect
    :returns: 2 when called without any file name, otherwise 0
    """
    logging.basicConfig(level=logging.DEBUG)

    nothing_to_do = not filenames
    if nothing_to_do:
        logging.info('need file name[s]')
        return 2

    for name in filenames:
        logging.info('checking file {0}'.format(name))

        is_ole = olefile.isOleFile(name)
        if not is_ole:
            logging.info('not an ole file - skip')
            continue

        # keep the file object bound to a name while its streams are in use
        xls_file = XlsFile(name)
        for current in xls_file.get_streams():
            logging.info(current)
            if isinstance(current, WorkbookStream):
                for entry in current.iter_records():
                    logging.info(' {0}'.format(entry))

    return 0
def check(self):
    """
    Run all checks on self.filename and return the list of indicators.

    An 'ole_format' indicator is always appended to self.indicators first.
    If the file is not an OLE file, its value is set to False and the list
    is returned right away. Otherwise the file is opened as self.ole, all
    check_* methods are run and the file is closed again.

    :returns: self.indicators
    """
    # check if it is actually an OLE file:
    oleformat = Indicator('ole_format', True, name='OLE format')
    self.indicators.append(oleformat)
    if not olefile.isOleFile(self.filename):
        oleformat.value = False
        return self.indicators

    # parse file:
    self.ole = olefile.OleFileIO(self.filename)
    try:
        # checks:
        self.check_properties()
        self.check_encrypted()
        self.check_word()
        self.check_excel()
        self.check_powerpoint()
        self.check_visio()
        self.check_ObjectPool()
        self.check_flash()
    finally:
        # close the file even if one of the checks raised
        self.ole.close()
    return self.indicators
def check(self):
    """
    Analyse self.filename and return self.indicators.

    The first indicator appended is 'ole_format'. For a file that is not
    OLE it is set to False and nothing else is checked. For an OLE file the
    file is opened (kept in self.ole while the checks run), the check_*
    methods are called in a fixed order, and the file is closed afterwards
    regardless of whether a check raised.

    :returns: self.indicators
    """
    # check if it is actually an OLE file:
    oleformat = Indicator('ole_format', True, name='OLE format')
    self.indicators.append(oleformat)
    if not olefile.isOleFile(self.filename):
        oleformat.value = False
        return self.indicators

    # parse file:
    self.ole = olefile.OleFileIO(self.filename)
    try:
        # checks:
        self.check_properties()
        self.check_encrypted()
        self.check_word()
        self.check_excel()
        self.check_powerpoint()
        self.check_visio()
        self.check_ObjectPool()
        self.check_flash()
    finally:
        # without this an exception in any check would leak the open file
        self.ole.close()
    return self.indicators
def process_file(filepath, field_filter_mode=None):
    """
    Decide which of process_doc/x, process_xls/x or process_rtf to call.

    OLE files go to process_xls or process_doc. Files whose first 4 bytes
    are b'{\\\\rt' go to process_rtf. For all other files the type is
    determined with ooxml.get_type: Excel goes to process_xlsx, every other
    type -- and every file whose type cannot be determined -- goes to
    process_docx.

    Only the type detection is guarded by try/except; exceptions raised by
    the process_* handlers propagate to the caller.

    :param filepath: path of the file to analyse
    :param field_filter_mode: forwarded to the rtf, xlsx and docx handlers;
                              the two OLE handlers are called without it
    :returns: the return value of the selected handler
    """
    if olefile.isOleFile(filepath):
        log.debug('checking streams to see whether this is xls')
        if xls_parser.is_xls(filepath):
            return process_xls(filepath)
        return process_doc(filepath)

    # read the magic bytes inside a with-block so the handle is closed again;
    # process_rtf is given the path, not the handle
    with open(filepath, 'rb') as file_handle:
        is_rtf = file_handle.read(4) == b'{\\rt'
    if is_rtf:
        # This is a RTF file
        return process_rtf(filepath, field_filter_mode)

    # a failing detection means "treat as docx"; a failing handler is a real
    # error and is not retried as docx
    try:
        doctype = ooxml.get_type(filepath)
    except Exception:
        log.debug('Exception trying to xml-parse file', exc_info=True)
        doctype = None
    else:
        log.debug('Detected file type: {0}'.format(doctype))

    if doctype == ooxml.DOCTYPE_EXCEL:
        return process_xlsx(filepath, field_filter_mode)
    return process_docx(filepath, field_filter_mode)
def test(filenames, ole_file_class=OleRecordFile,
         must_parse=None, do_per_record=None, verbose=False):
    """
    Parse all given file names and log their rough structure.

    Every OLE file is opened with `ole_file_class`; each stream and each
    record of each stream is logged, and `do_per_record` is called with
    every record. Files that are not OLE are skipped.

    Errors raised while iterating the records of a stream are re-raised if
    `must_parse` is empty/None or if the stream is an instance of
    `must_parse`. Otherwise the failure is logged and the next stream is
    processed.

    :param filenames: iterable of paths of the files to inspect
    :param ole_file_class: class used to open each OLE file
    :param must_parse: type (or tuple of types) of streams whose parse errors
                       must be raised
    :param do_per_record: callable invoked with each record; None for none
    :param verbose: log at DEBUG level instead of INFO
    :returns: 2 if `filenames` is empty, 0 otherwise
    """
    if verbose:
        log_level = logging.DEBUG
    else:
        log_level = logging.INFO
    logging.basicConfig(level=log_level)

    # fall back to a callback that ignores the record
    if do_per_record is None:
        handle_record = lambda record: None
    else:
        handle_record = do_per_record

    if not filenames:
        logging.info('need file name[s]')
        return 2

    for path in filenames:
        logging.info('checking file {0}'.format(path))
        if not olefile.isOleFile(path):
            logging.info('not an ole file - skip')
            continue

        record_file = ole_file_class(path)
        for stream in record_file.iter_streams():
            logging.info(' parse ' + str(stream))
            try:
                for record in stream.iter_records():
                    logging.info(' ' + str(record))
                    handle_record(record)
            except Exception:
                # no must_parse given, or this stream is required to parse
                if not must_parse or isinstance(stream, must_parse):
                    raise
                logging.info(' failed to parse', exc_info=True)

    return 0
def find_ole(filename, data):
    """
    try to open somehow as zip/ole/rtf/... ; yield None if fail

    If data is given, filename is (mostly) ignored: it is then only used in
    log messages.

    Generator that yields embedded ole streams in form of OleFileIO:

    * OLE input: for ppt files first everything find_ole_in_ppt yields,
      then (in any case) the input itself opened as OleFileIO
    * zip input: one OleFileIO per zip member that starts with olefile.MAGIC;
      None for a member that cannot be read because the zip is encrypted
    * anything else, or an unexpected exception: None

    OleFileIO objects created here are closed by this function once the
    consumer asks for the next item or closes the generator. The zip archive
    is closed when its branch is left.

    :param filename: path of the input file
    :param data: contents of the input, or None to read from filename
    """
    if data is not None:
        # isOleFile and is_ppt can work on data directly but zip need file
        # --> wrap data in a file-like object without copying data
        log.debug('working on data, file is not touched below')
        arg_for_ole = data
        arg_for_zip = FakeFile(data)
    else:
        # we only have a file name
        log.debug('working on file by name')
        arg_for_ole = filename
        arg_for_zip = filename

    ole = None
    try:
        if olefile.isOleFile(arg_for_ole):
            if is_ppt(arg_for_ole):
                log.info('is ppt file: ' + filename)
                for ole in find_ole_in_ppt(arg_for_ole):
                    yield ole
                    ole = None   # is closed in find_ole_in_ppt
            # in any case: check for embedded stuff in non-sectored streams
            log.info('is ole file: ' + filename)
            ole = olefile.OleFileIO(arg_for_ole)
            yield ole
        elif is_zipfile(arg_for_zip):
            log.info('is zip file: ' + filename)
            # with-block: the archive is closed when iteration is done, when
            # the consumer abandons the generator or when an error escapes
            with ZipFile(arg_for_zip, 'r') as zipper:
                for subfile in zipper.namelist():
                    head = b''
                    try:
                        with zipper.open(subfile) as file_handle:
                            head = file_handle.read(len(olefile.MAGIC))
                    except RuntimeError:
                        log.error('zip is encrypted: ' + filename)
                        yield None
                        continue

                    if head == olefile.MAGIC:
                        log.info(' unzipping ole: ' + subfile)
                        with ZipSubFile(zipper, subfile) as file_handle:
                            try:
                                ole = olefile.OleFileIO(file_handle)
                                yield ole
                            except IOError:
                                log.warning(
                                    'Error reading data from {0}/{1} or '
                                    'interpreting it as OLE object'.format(
                                        filename, subfile))
                                log.debug('', exc_info=True)
                            finally:
                                # close before the sub file is closed and
                                # reset so the outer finally does not close
                                # it a second time
                                if ole is not None:
                                    ole.close()
                                    ole = None
                    else:
                        log.debug('unzip skip: ' + subfile)
        else:
            log.warning(
                'open failed: {0} (or its data) is neither zip nor OLE'.format(
                    filename))
            yield None
    except Exception:
        log.error('Caught exception opening {0}'.format(filename),
                  exc_info=True)
        yield None
    finally:
        if ole is not None:
            ole.close()
def process_file(filepath, field_filter_mode=None):
    """
    Call process_ole for OLE files and process_openxml for all other files.

    :param filepath: path of the file to analyse
    :param field_filter_mode: forwarded to process_openxml only
    :returns: the return value of the function that was called
    """
    # everything that is not OLE is treated as openxml
    if not olefile.isOleFile(filepath):
        return process_openxml(filepath, field_filter_mode)
    return process_ole(filepath)
def process_file(filepath):
    """
    Call process_ole for OLE files and process_openxml for all other files.

    :param filepath: path of the file to analyse
    :returns: the return value of the function that was called
    """
    # both handlers take just the path, so pick one and call it
    handler = process_ole if olefile.isOleFile(filepath) else process_openxml
    return handler(filepath)